#!/usr/bin/env nix-shell
#! nix-shell -i python3 -p "python37.withPackages(ps: [ps.notmuch ps.scikitlearn])"
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from time import time
import json
import numpy as np
import subprocess
import notmuch
"""
plan:
1. fetch all the emails - need to call out to notmuch
2. need to normalize all the inputs?
3. do kmeans, this is actually rather simple
refs:
- https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py
get the email contents, will return a list of strings:
notmuch show --format=json -- from:ben@bsima.me date:today \
| jq '..|.content?|select(type=="string")'
"""

def get_body(e):
    # flatten every MIME part's payload into one string
    parts = e.get_message_parts()
    return ''.join([p.get_payload() for p in parts])

def get_emails_for(date):
    db = notmuch.Database()
    query = db.create_query(f'not tag:deleted and date:{date}')
    emails = query.search_messages()
    #contents = {e.get_message_id(): get_body(e) for e in emails}
    return [get_body(e) for e in emails]

DEBUG = True

def done(*s):
    if DEBUG:
        print(f"done in {time()-t0:.3f}s")
        for s0 in s:
            print(s0)
        print()

def debug(s):
    if DEBUG:
        print(s)

# mail ########################################################################
debug("loading mail...")
t0 = time()
dataset = get_emails_for("-3d..today")
# 'labels' should be the "correct" set of tags for the emails: an array of
# ints where each int corresponds to a notmuch tag. It is empty for now, so
# fall back to a guessed cluster count rather than the degenerate
# n_clusters=0 (see the sketch below for one way to fill it in).
labels = []
true_k = np.unique(labels).shape[0] or 8  # 8 is an arbitrary guess
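# A hypothetical sketch for filling in labels from notmuch tags: take each
# message's first tag and map tags to integer ids. This assumes the messages
# from search_messages() are kept around instead of being discarded inside
# get_emails_for:
#   tags = [next(iter(m.get_tags()), 'untagged') for m in messages]
#   tag_ids = {t: i for i, t in enumerate(sorted(set(tags)))}
#   labels = [tag_ids[t] for t in tags]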
done()

# features ####################################################################
debug("extracting features...")
t0 = time()
# hash tokens into a fixed 10k-dimensional space, then reweight with tf-idf
hasher = HashingVectorizer(
    n_features=10000,
    stop_words='english',
    alternate_sign=False,
    norm='l2',
    binary=False
)
vectorizer = make_pipeline(hasher, TfidfTransformer())
# vectorize all the email bodies (fit_transform takes the list directly)
X = vectorizer.fit_transform(dataset)
done(f"n_samples: {X.shape[0]}, n_features: {X.shape[1]}")
debug("dimensionality reduction...")
t0 = time()
svd = TruncatedSVD(5) # FIXME: no idea what number here
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
explained_var = svd.explained_variance_ratio_.sum()
done(f"explained variance: {int(explained_var*100)}")

# clustering ##################################################################
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
            verbose=True)
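# MiniBatchKMeans (imported above) would be a drop-in replacement if the
# mailbox outgrows plain KMeans, e.g. (a sketch, parameters untuned):
#   km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
#                        batch_size=1000, verbose=True)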
debug(f"clustering sparse data with {km}")
t0 = time()
km.fit(X)
done()
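# quick sanity check on the result: how many emails landed in each cluster
debug(f"cluster sizes: {np.bincount(km.labels_)}")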