1 files changed, 106 insertions, 0 deletions
diff --git a/notmuch-k b/notmuch-k
new file mode 100755
index 0000000..8c1dd8c
--- /dev/null
+++ b/notmuch-k
@@ -0,0 +1,106 @@
+#!/usr/bin/env nix-shell
+#! nix-shell -i python3 -p "python37.withPackages(ps:[ps.notmuch ps.scikitlearn])"
+
+from sklearn import metrics
+from sklearn.cluster import KMeans, MiniBatchKMeans
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import HashingVectorizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import Normalizer
+from time import time
+import json
+import numpy as np
+import sys
+import subprocess
+import notmuch
+
+"""
+plan:
+
+1. fetch all the emails - need to call out to notmuch
+2. need to normalize all the inputs?
+3. do kmeans, this is actually rather simple
+
+refs:
+
+- https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py
+
+get the email contents, will return a list of strings:
+
+   notmuch show --format=json -- from:ben@bsima.me date:today \
+     | jq '..|.content?|select(type=="string")'
+
+"""
+
+
+def get_body(e):
+    """Concatenate the payloads of all parts of message `e` into one string."""
+    return ''.join(part.get_payload() for part in e.get_message_parts())
+
+
+def get_emails_for(date):
+    """Return a list with the body of every non-deleted message matching `date`."""
+    # `with` closes the database handle once all bodies have been extracted.
+    with notmuch.Database() as db:
+        query = db.create_query(f'not tag:deleted and date:{date}')
+        return [get_body(e) for e in query.search_messages()]
+
+DEBUG=True  # when True, done() and debug() print; otherwise they stay silent
+
+def done(*s):
+    """If DEBUG, print time elapsed since global `t0`, each of `s`, then a blank line."""
+    if DEBUG:
+        # One print call; the trailing "" yields the closing blank line.
+        print(f"done in {time()-t0}s", *s, "", sep="\n")
+
+def debug(s):
+    """Print `s` when the module-level DEBUG flag is set."""
+    if DEBUG: print(s)
+
+# mail ####################################################################
+
+debug("loading mail...")
+t0 = time()  # start of the interval reported by the next done() call
+dataset = get_emails_for("-3d..today")  # bodies of messages matching date:-3d..today
+# TODO: `labels` must become the "correct" tag of each email, as an array of
+# ints (one int per notmuch tag); while it is empty, true_k below is 0.
+labels = []
+true_k = np.unique(labels).shape[0]
+done()
+
+# features ####################################################################
+
+debug("extracting features...")
+t0 = time()  # restart the timer reported by done()
+hasher = HashingVectorizer(
+    n_features=10000,  # number of columns in the hashed output matrix
+    stop_words='english',
+    alternate_sign=False,
+    norm='l2',
+    binary=False
+    )
+vectorizor = make_pipeline(hasher, TfidfTransformer())
+# wrap the email bodies in an array and run them through hashing + TF-IDF
+X = vectorizor.fit_transform(np.array(dataset))
+done(f"n_samples: {X.shape[0]}, n_features: {X.shape[1]}")
+
+debug("dimensionality reduction...")
+t0 = time()  # restart the timer reported by done()
+svd = TruncatedSVD(5) # FIXME: 5 components is an unvalidated guess
+normalizer = Normalizer(copy=False)
+lsa = make_pipeline(svd, normalizer)  # SVD, then normalization of the result
+X = lsa.fit_transform(X)  # X is replaced by its reduced representation
+explained_var = svd.explained_variance_ratio_.sum()
+done(f"explained variance: {int(explained_var*100)}")  # truncated to a whole percent
+
+# clustering ##################################################################
+
+km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
+            verbose=True)  # NOTE(review): true_k is derived from the empty `labels` list, i.e. 0
+
+debug(f"clustering sparse data with {km}")  # X here is the output of the LSA pipeline
+t0 = time()  # restart the timer reported by done()
+km.fit(X)
+done()