Mining the Social Web

Mining Mailboxes

This Jupyter Notebook provides an interactive way to follow along with and explore the examples from the video series. The intent behind this notebook is to reinforce the concepts in a fun, convenient, and effective way.

Converting a toy mailbox to JSON

In [ ]:
import mailbox # part of the Python standard library -- no pip install needed
import json
In [ ]:
MBOX = 'resources/ch07-mailboxes/data/northpole.mbox'
In [ ]:
# A routine that makes a ton of simplifying assumptions
# about converting an mbox message into a Python object
# given the nature of the northpole.mbox file in order
# to demonstrate the basic parsing of an mbox with mail
# utilities

def objectify_message(msg):
    """Convert an mbox message into a plain Python dict.

    Makes a ton of simplifying assumptions given the nature of the
    northpole.mbox file, in order to demonstrate the basic parsing of
    an mbox with mail utilities.

    Args:
        msg: an email.message.Message (as yielded by mailbox.mbox).

    Returns:
        dict mapping each header name to its value, plus 'contentType'
        and 'content' taken from the first MIME part (the message is
        assumed to have exactly one part).
    """
    # Map in fields from the message; items() already yields
    # (header, value) pairs, so dict() consumes it directly.
    o_msg = dict(msg.items())

    # Assume one part to the message: walk() yields the message itself
    # first, so next() grabs that single part without building a list.
    part = next(msg.walk())
    o_msg['contentType'] = part.get_content_type()
    o_msg['content'] = part.get_payload()

    return o_msg
In [ ]:
# Create an mbox that can be iterated over and transform each of its
# messages to a convenient JSON representation

# Open the mbox, convert each message it contains into the simplified
# dictionary form, and dump the whole collection as pretty-printed JSON.
mbox = mailbox.mbox(MBOX)

messages = [objectify_message(message) for message in mbox]

print(json.dumps(messages, indent=1))

Downloading the Enron email corpus

In [ ]:
import sys
from urllib.request import urlopen
import time
import os
import envoy # pip install envoy

URL = "http://www.cs.cmu.edu/~enron/enron_mail_20110402.tgz"
DOWNLOAD_DIR = "resources/ch07-mailboxes/data"

# Downloads a file and displays a download status every 5 seconds

def download(url, download_dir):
    """Download `url` into `download_dir`, printing a status line at most
    every 5 seconds.

    Args:
        url: HTTP(S) URL of the file to fetch.
        download_dir: existing directory to write the file into.

    Returns:
        The path of the downloaded file.
    """
    file_name = url.split('/')[-1]
    file_path = os.path.join(download_dir, file_name)

    u = urlopen(url)
    try:
        meta = u.info()
        file_size = int(meta['Content-Length'])
        print("Downloading: %s Bytes: %s" % (file_name, file_size))

        file_size_dl = 0
        block_sz = 8192
        last_update = time.time()
        # 'with' guarantees the output file is closed even if the
        # connection drops mid-transfer (the original leaked it).
        with open(file_path, 'wb') as f:
            while True:
                buffer = u.read(block_sz)
                if not buffer:
                    break

                file_size_dl += len(buffer)
                f.write(buffer)
                if time.time() - last_update > 5:
                    # end='\r' returns the cursor to the start of the line
                    # so each status overwrites the previous one; the old
                    # chr(8) backspace trick was defeated by print()'s
                    # trailing newline in Python 3.
                    print(r"%10d MB  [%5.2f%%]" % (file_size_dl / 1000000.0,
                                                   file_size_dl * 100.0 / file_size),
                          end='\r')
                    sys.stdout.flush()
                    last_update = time.time()
    finally:
        u.close()
    return file_path

# Extracts a gzipped tarfile. e.g. "$ tar xzf filename.tgz"

def tar_xzf(f):
    """Extract the gzipped tarfile `f` into DOWNLOAD_DIR, e.g.
    "$ tar xzf filename.tgz", and print the command's stdout/stderr.
    """
    import shlex  # local import: only needed for quoting here

    # Call out to the shell for a faster decompression.
    # This will still take a while because Vagrant synchronizes
    # thousands of files that are extracted to the host machine.
    # Quote both paths so spaces or shell metacharacters in a
    # filename can't break (or inject into) the command line.
    r = envoy.run("tar xzf %s -C %s" % (shlex.quote(f), shlex.quote(DOWNLOAD_DIR)))
    print(r.std_out)
    print(r.std_err)

# Fetch the Enron corpus and unpack it into DOWNLOAD_DIR
f = download(URL, DOWNLOAD_DIR)
print("Download complete: %s" % (f,))
tar_xzf(f)
print("Decompression complete")
print("Data is ready")

Converting the Enron corpus to a standardized mbox format

The results of the sample code below have been saved as a file, enron.mbox.bz2, in a compressed format. You may decompress it to enron.mbox using whatever tool you prefer, appropriate to your computer's operating system. On UNIX-like systems, the file may be decompressed with the command:

bzip2 -d enron.mbox.bz2

In [ ]:
import re
import email
from time import asctime
import os
import sys
from dateutil.parser import parse # pip install python_dateutil

# XXX: Download the Enron corpus to resources/ch07-mailboxes/data
# and unarchive it there.

MAILDIR = 'resources/ch07-mailboxes/data/enron_mail_20110402/maildir' 

# Where to write the converted mbox
MBOX = 'resources/ch07-mailboxes/data/enron.mbox'

# Walk the directories and process any folder named 'inbox', appending
# each message to MBOX with a traditional "From <sender> <asctime>"
# separator line.  'with' guarantees the output file is flushed and
# closed even if a message blows up mid-run.

with open(MBOX, 'w+') as mbox:

    for (root, dirs, file_names) in os.walk(MAILDIR):

        if root.split(os.sep)[-1].lower() != 'inbox':
            continue

        # Process each message in 'inbox'

        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            # Close each message file promptly; the original leaked
            # one file handle per message.
            with open(file_path, errors='ignore') as msg_file:
                message_text = msg_file.read()

            # Compute fields for the From_ line in a traditional mbox
            # message.  Skip malformed messages that lack a From or Date
            # header rather than crashing on a None match object.
            from_match = re.search(r"From: ([^\r\n]+)", message_text)
            date_match = re.search(r"Date: ([^\r\n]+)", message_text)
            if from_match is None or date_match is None:
                continue
            _from = from_match.group(1)

            # Convert the date to the asctime representation for the From_ line
            _date = asctime(parse(date_match.group(1)).timetuple())

            msg = email.message_from_string(message_text)
            msg.set_unixfrom('From {0} {1}'.format(_from, _date))

            mbox.write(msg.as_string(unixfrom=True) + "\n\n")

Loading the mailbox data into Pandas

In [ ]:
import pandas as pd # pip install pandas
import mailbox

MBOX = 'resources/ch07-mailboxes/data/enron.mbox'
mbox = mailbox.mbox(MBOX)

# Build {row_index: {header: value, ..., 'Body': flattened_text}} and
# load it into a DataFrame, one row per message.
mbox_dict = {}
for i, msg in enumerate(mbox):
    record = {}
    for header in msg.keys():
        record[header] = msg[header]
    # Collapse all whitespace line structure in the body to single spaces
    body = msg.get_payload()
    record['Body'] = body.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').strip()
    mbox_dict[i] = record

df = pd.DataFrame.from_dict(mbox_dict, orient='index')
In [ ]:
# Peek at the first few rows of the converted mailbox DataFrame
df.head()
In [ ]:
# Use each message's Date header as the DataFrame index.
# pd.to_datetime over the whole column is vectorized, which is much
# faster than parsing row-by-row with .apply(pd.to_datetime).
df.index = pd.to_datetime(df['Date'])

# Remove non-essential columns
cols_to_keep = ['From', 'To', 'Cc', 'Bcc', 'Subject', 'Body']
df = df[cols_to_keep]
In [ ]:
# Confirm the trimmed DataFrame now keeps only the essential columns
df.head()

Describe the DataFrame

In [ ]:
# Summary statistics for each column of the DataFrame
df.describe()

Investigate email volume by month

In [ ]:
# Restrict to the window of interest, then count messages per calendar month
start_date = '2000-1-1'
stop_date = '2003-1-1'

in_window = (df.index > start_date) & (df.index <= stop_date)
vol_by_month = df.loc[in_window].resample('1M').count()['To']

print(vol_by_month)
In [ ]:
from prettytable import PrettyTable

# Render the monthly volumes as a right-aligned text table
pt = PrettyTable(field_names=['Year', 'Month', 'Num Msgs'])
pt.align['Num Msgs'], pt.align['Month'] = 'r', 'r'

# Use an explicit loop: the original list comprehension was run purely
# for its side effects and built a throwaway list of Nones.
for ind, vol in zip(vol_by_month.index, vol_by_month):
    pt.add_row([ind.year, ind.month, vol])

print(pt)
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline

# Horizontal bar chart; the series is reversed so the earliest month
# appears at the top of the chart
vol_by_month[::-1].plot(kind='barh', figsize=(5,8), title='Email Volume by Month')

Analyzing Patterns in Sender/Recipient Communications

In [ ]:
# Collect the distinct values appearing in each address-bearing column
senders = df['From'].unique()
receivers = df['To'].unique()
cc_receivers = df['Cc'].unique()
bcc_receivers = df['Bcc'].unique()

print('Num Senders: {}'.format(len(senders)))
print('Num Receivers: {}'.format(len(receivers)))
print('Num CC Receivers: {}'.format(len(cc_receivers)))
print('Num BCC Receivers: {}'.format(len(bcc_receivers)))
In [ ]:
senders = set(senders)
receivers = set(receivers)
cc_receivers = set(cc_receivers)
bcc_receivers = set(bcc_receivers)

# Senders who were also direct receivers
senders_intersect_receivers = senders & receivers

# Senders that didn't receive any messages
senders_diff_receivers = senders - receivers

# Receivers that didn't send any messages
receivers_diff_senders = receivers - senders

# Senders who were any kind of receiver: union every receiver
# category (To, Cc, Bcc) first, then intersect with the senders
all_receivers = receivers | cc_receivers | bcc_receivers
senders_all_receivers = senders & all_receivers

print("Num senders in common with receivers:", len(senders_intersect_receivers))
print("Num senders who didn't receive:", len(senders_diff_receivers))
print("Num receivers who didn't send:", len(receivers_diff_senders))
print("Num senders in common with *all* receivers:", len(senders_all_receivers))

Who is Sending and Receiving the Most Email?

In [ ]:
import numpy as np

# Count messages per sender and per receiver.  Counting a column that
# is never null for the grouped rows gives the group sizes.
top_senders = df.groupby('From').count()['To']
top_receivers = df.groupby('To').count()['From']

# Order each series from most to least active.  Series.sort_values
# replaces the older argsort-then-fancy-index idiom (np.argsort on a
# pandas Series followed by positional indexing is deprecated).
top_senders = top_senders.sort_values(ascending=False)
top_receivers = top_receivers.sort_values(ascending=False)
In [ ]:
from prettytable import PrettyTable

from prettytable import PrettyTable

top10 = top_senders[:10]
pt = PrettyTable(field_names=['Rank', 'Sender', 'Messages Sent'])
pt.align['Messages Sent'] = 'r'

# Explicit loop instead of a side-effect list comprehension.  Also avoid
# naming the loop variable `email`, which shadowed the stdlib email
# module imported earlier in this notebook.
for rank, (address, vol) in enumerate(zip(top10.index.values, top10.values), 1):
    pt.add_row([rank, address, vol])

print(pt)
In [ ]:
from prettytable import PrettyTable

top10 = top_receivers[:10]
pt = PrettyTable(field_names=['Rank', 'Receiver', 'Messages Received'])
pt.align['Messages Sent'] = 'r'
[ pt.add_row([i+1, email, vol]) for i, email, vol in zip(range(10), top10.index.values, top10.values)]

print(pt)

Searching by keyword

In [ ]:
import textwrap

search_term = 'raptor'

# Match the term (case-insensitively) in either the body or the subject
query = (df['Body'].str.contains(search_term, case=False) | df['Subject'].str.contains(search_term, case=False))

results = df[query]

print('{0} results found.'.format(query.sum()))
print('Printing first 10 results...')
# Guard against fewer than 10 matches: a bare range(10) raised
# IndexError from .iloc whenever the search produced < 10 results.
for i in range(min(10, len(results))):
    subject, body = results.iloc[i]['Subject'], results.iloc[i]['Body']
    print()
    print('SUBJECT: ', subject)
    print('-'*20)
    # Show at most 5 wrapped lines of each body as a preview
    for line in textwrap.wrap(body, width=70, max_lines=5):
        print(line)

Accessing Your Gmail Programmatically

In [ ]:
import httplib2
import os

from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/gmail-python-quickstart.json
# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/gmail-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/gmail.readonly'
# Path to the OAuth client secret downloaded from the Google API Console
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Gmail API Python Quickstart'
In [ ]:
def get_credentials(flags=None):
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Args:
        flags: Optional argparse.Namespace of oauth2client command-line
            flags (e.g. tools.argparser.parse_args([])).  Defaults to
            None, which falls back to tools.run().  The original code
            read an undefined global `flags`, raising NameError whenever
            the OAuth flow actually had to run; making it a defaulted
            parameter fixes that while keeping existing calls working.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir,
                                   'gmail-python-quickstart.json')

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials
In [ ]:
# Authorize an HTTP client with the stored credentials, build the Gmail
# API service object, and list the labels defined in the mailbox.
credentials = get_credentials()
http = credentials.authorize(httplib2.Http())
service = discovery.build('gmail', 'v1', http=http)

results = service.users().labels().list(userId='me').execute()
labels = results.get('labels', [])

if labels:
    print('Labels:')
    for label in labels:
        print(label['name'])
else:
    print('No labels found.')

Fetch Gmail Messages

In [ ]:
query = 'Mining'
max_results = 10

# Search for Gmail messages containing the query term
results = service.users().messages().list(userId='me', q=query, maxResults=max_results).execute()

for result in results['messages']:
    print(result['id'])
    # Retrieve the message itself
    msg = service.users().messages().get(userId='me', id=result['id'], format='minimal').execute()
    print(msg)