#!/usr/bin/env nix-shell
#! nix-shell -i python3 -p "python37.withPackages(ps: [ps.notmuch ps.scikitlearn])"
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from time import time
import json
import numpy as np
import subprocess
import notmuch
"""
plan:
1. fetch all the emails - need to call out to notmuch
2. need to normalize all the inputs?
3. do kmeans, this is actually rather simple
refs:
- https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py
get the email contents, will return a list of strings:
notmuch show --format=json -- from:ben@bsima.me date:today \
| jq '..|.content?|select(type=="string")'
"""

def get_body(e):
    # flatten every MIME part's payload into one string
    parts = e.get_message_parts()
    return ''.join([p.get_payload() for p in parts])

def get_emails_for(date):
    db = notmuch.Database()
    query = db.create_query(f'not tag:deleted and date:{date}')
    emails = query.search_messages()
    #contents = {e.get_message_id(): get_body(e) for e in emails}
    return [get_body(e) for e in emails]

DEBUG = True

def done(*s):
    if DEBUG:
        print(f"done in {time()-t0:.3f}s")
        for s0 in s:
            print(s0)
        print()

def debug(s):
    if DEBUG:
        print(s)

# mail ########################################################################
debug("loading mail...")
t0 = time()
dataset = get_emails_for("-3d..today")
# 'labels' should be the "correct" set of tags for the emails: an array of
# ints where each int corresponds to a notmuch tag. It is empty for now, so
# fall back to a guessed cluster count rather than the degenerate
# n_clusters=0 (see the sketch below for one way to fill it in).
labels = []
true_k = np.unique(labels).shape[0] or 8  # 8 is an arbitrary guess
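# A hypothetical sketch for filling in labels from notmuch tags: take each
# message's first tag and map tags to integer ids. This assumes the messages
# from search_messages() are kept around instead of being discarded inside
# get_emails_for:
#   tags = [next(iter(m.get_tags()), 'untagged') for m in messages]
#   tag_ids = {t: i for i, t in enumerate(sorted(set(tags)))}
#   labels = [tag_ids[t] for t in tags]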
done()

# features ####################################################################
debug("extracting features...")
t0 = time()
# hash tokens into a fixed 10k-dimensional space, then reweight with tf-idf
hasher = HashingVectorizer(
    n_features=10000,
    stop_words='english',
    alternate_sign=False,
    norm='l2',
    binary=False
)
vectorizer = make_pipeline(hasher, TfidfTransformer())
# vectorize all the email bodies (fit_transform takes the list directly)
X = vectorizer.fit_transform(dataset)
done(f"n_samples: {X.shape[0]}, n_features: {X.shape[1]}")
debug("dimensionality reduction...")
t0 = time()
svd = TruncatedSVD(5) # FIXME: no idea what number here
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
explained_var = svd.explained_variance_ratio_.sum()
done(f"explained variance: {int(explained_var*100)}")

# clustering ##################################################################
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
            verbose=True)
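# MiniBatchKMeans (imported above) would be a drop-in replacement if the
# mailbox outgrows plain KMeans, e.g. (a sketch, parameters untuned):
#   km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
#                        batch_size=1000, verbose=True)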
debug(f"clustering sparse data with {km}")
t0 = time()
km.fit(X)
done()
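# quick sanity check on the result: how many emails landed in each cluster
debug(f"cluster sizes: {np.bincount(km.labels_)}")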