diff options
Diffstat (limited to 'notmuch-k')
-rwxr-xr-x | notmuch-k | 106 |
1 files changed, 106 insertions, 0 deletions
#!/usr/bin/env nix-shell
#! nix-shell -i python3 -p "python37.withPackages(ps:[ps.notmuch ps.scikitlearn])"
# Recovered from: diff --git a/notmuch-k b/notmuch-k
# (new file, mode 100755, index 0000000..8c1dd8c)
r"""
Cluster recent notmuch mail with k-means.

plan:

1. fetch all the emails - need to call out to notmuch
2. need to normalize all the inputs?
3. do kmeans, this is actually rather simple

refs:

- https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py

get the email contents, will return a list of strings:

    notmuch show --format=json -- from:ben@bsima.me date:today \
      | jq '..|.content?|select(type=="string")'

"""

from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from time import time
import json
import numpy as np
import sys
import subprocess
import notmuch

# When True, debug() and done() print progress and timing information.
DEBUG = True

# Cluster count used while no ground-truth labels are available. 8 mirrors
# scikit-learn's own default for KMeans(n_clusters=...).
FALLBACK_CLUSTERS = 8


def get_body(e):
    """Return the concatenated payloads of all parts of notmuch message `e`."""
    parts = e.get_message_parts()
    # get_payload() yields a list for multipart containers (and may yield
    # None); only string payloads can be joined, so skip everything else.
    payloads = (p.get_payload() for p in parts)
    return ''.join(text for text in payloads if isinstance(text, str))


def get_emails_for(date):
    """Return the bodies of all non-deleted messages matching `date`.

    `date` is interpolated into a notmuch `date:` search term, so it may be
    anything notmuch accepts there (e.g. "-3d..today").
    """
    db = notmuch.Database()
    query = db.create_query(f'not tag:deleted and date:{date}')
    emails = query.search_messages()
    return [get_body(e) for e in emails]


def done(*s):
    """Report the time elapsed since the global `t0`, then any extra lines.

    Prints nothing unless DEBUG is set. Callers must assign `t0 = time()`
    before starting the step they want timed.
    """
    if DEBUG:
        print(f"done in {time()-t0}s")
        for line in s:
            print(line)
        print()


def debug(s):
    """Print `s` when DEBUG is set."""
    if DEBUG:
        print(s)


# mail ####################################################################

debug("loading mail...")
t0 = time()
dataset = get_emails_for("-3d..today")
# TODO: needs to be a "correct" set of tags for the emails, an array of ints
# where each int corresponds to a notmuch tag
labels = []
true_k = np.unique(labels).shape[0]
done()

if not dataset:
    # The vectorizer and k-means both fail on zero samples; stop cleanly.
    sys.exit("no mail matched the query, nothing to cluster")

# features ####################################################################

debug("extracting features...")
t0 = time()
hasher = HashingVectorizer(
    n_features=10000,
    stop_words='english',
    alternate_sign=False,
    norm='l2',
    binary=False,
)
vectorizor = make_pipeline(hasher, TfidfTransformer())
# create an array of all the email bodies and vectorize
X = vectorizor.fit_transform(np.array(dataset))
done(f"n_samples: {X.shape[0]}, n_features: {X.shape[1]}")

debug("dimensionality reduction...")
t0 = time()
svd = TruncatedSVD(5)  # FIXME: no idea what number here
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
explained_var = svd.explained_variance_ratio_.sum()
done(f"explained variance: {int(explained_var*100)}")

# clustering ##################################################################

# `labels` is still empty, which makes true_k == 0 and KMeans refuses to fit
# zero clusters. Fall back to a default, and never ask for more clusters than
# there are samples.
n_clusters = min(true_k or FALLBACK_CLUSTERS, X.shape[0])
km = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=100, n_init=1,
            verbose=True)

debug(f"clustering sparse data with {km}")
t0 = time()
km.fit(X)
done()