#!/usr/bin/env nix-shell
#! nix-shell -i python3 -p "python37.withPackages(ps:[ps.notmuch ps.scikitlearn])"
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from time import time
import json
import numpy as np
import sys
import subprocess
import notmuch

"""
plan:

1. fetch all the emails - need to call out to notmuch
2. need to normalize all the inputs?
3. do kmeans, this is actually rather simple

refs:

- https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py

get the email contents, will return a list of strings:

    notmuch show --format=json -- from:ben@bsima.me date:today \
    | jq '..|.content?|select(type=="string")'
"""


def _part_text(part):
    """Return the decoded text of one MIME part, or '' if it is not text.

    Only ``text/*`` parts are kept: feeding base64 attachments or
    transfer-encoded bodies to the vectorizer would add noise tokens
    instead of words.
    """
    if part.get_content_maintype() != "text":
        return ""
    raw = part.get_payload(decode=True)
    if raw is None:
        return ""
    charset = part.get_content_charset() or "utf-8"
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The part declares a charset Python does not know about.
        return raw.decode("utf-8", errors="replace")


def get_body(e):
    """Return the concatenated text content of notmuch message *e*."""
    return "".join(_part_text(p) for p in e.get_message_parts())


def get_emails_for(date):
    """Return the bodies of all non-deleted messages matching *date*.

    *date* is interpolated into a notmuch ``date:`` search term, e.g.
    ``"-3d..today"``. The result is a list with one string per message.
    """
    db = notmuch.Database()
    query = db.create_query(f"not tag:deleted and date:{date}")
    # Build the list while `db` and `query` are still referenced locally.
    return [get_body(e) for e in query.search_messages()]


DEBUG = True

# Used when no ground-truth labels are available; 8 is KMeans's own default.
DEFAULT_K = 8


def done(*s):
    """Print the time elapsed since the global ``t0``, then each item of *s*.

    Prints nothing unless DEBUG is set.
    """
    if not DEBUG:
        return
    print(f"done in {time()-t0}s")
    for line in s:
        print(line)
    print()


def debug(s):
    """Print *s* when DEBUG is set."""
    if DEBUG:
        print(s)


def choose_k(labels, n_samples):
    """Return the number of clusters to ask KMeans for.

    Uses the number of distinct *labels* when there are any, otherwise
    DEFAULT_K. The result is capped at *n_samples* and is never below 1,
    because KMeans rejects n_clusters < 1 and n_clusters > n_samples.
    """
    k = np.unique(labels).shape[0] or DEFAULT_K
    return max(1, min(k, n_samples))


# mail ####################################################################

debug("loading mail...")
t0 = time()

dataset = get_emails_for("-3d..today")
if not dataset:
    # Nothing to vectorize; stop here rather than failing inside sklearn.
    sys.exit("no mail matched the query, nothing to cluster")

# needs to be a "correct" set of tags for the emails, an array of ints where
# each int corresponds to a notmuch tag
labels = []
# `labels` is still empty, so this currently falls back to DEFAULT_K.
true_k = choose_k(labels, len(dataset))

done()

# features ####################################################################

debug("extracting features...")
t0 = time()

hasher = HashingVectorizer(
    n_features=10000,
    stop_words='english',
    alternate_sign=False,
    norm='l2',
    binary=False
)

vectorizor = make_pipeline(hasher, TfidfTransformer())

# create an array of all the email bodies and vectorize
X = vectorizor.fit_transform(np.array(dataset))

done(f"n_samples: {X.shape[0]}, n_features: {X.shape[1]}")

debug("dimensionality reduction...")
t0 = time()

svd = TruncatedSVD(5)  # FIXME: no idea what number here
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
explained_var = svd.explained_variance_ratio_.sum()

done(f"explained variance: {int(explained_var*100)}")

# clustering ##################################################################

km = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=1,
    verbose=True
)

debug(f"clustering sparse data with {km}")
t0 = time()
km.fit(X)
done()