summaryrefslogtreecommitdiff
path: root/notmuch-k
diff options
context:
space:
mode:
authorBen Sima <ben@bsima.me>2019-05-13 13:50:08 -0700
committerBen Sima <ben@bsima.me>2019-05-13 13:51:06 -0700
commitfc7dfe1ad145ab57e59890ec1351d9a05f1860ec (patch)
treef52da6dc823e6809e31952c547975255cf34a5e4 /notmuch-k
parent78971e60fa76e7717384df9965a8232d39a76501 (diff)
notmuch-autotag: some updates with docs
Diffstat (limited to 'notmuch-k')
-rwxr-xr-xnotmuch-k44
1 files changed, 19 insertions, 25 deletions
diff --git a/notmuch-k b/notmuch-k
index 8c1dd8c..04fe290 100755
--- a/notmuch-k
+++ b/notmuch-k
@@ -1,5 +1,17 @@
#!/usr/bin/env nix-shell
#! nix-shell -i python3 -p "python37.withPackages(ps:[ps.notmuch ps.scikitlearn])
+"""
+plan:
+
+1. fetch all the emails - uses notmuch python library
+2. need to normalize all the inputs?
+3. train a naive bayes classifier and classify emails
+4. apply inferred tags
+
+refs:
+
+- https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py
+"""
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
@@ -16,30 +28,12 @@ import sys
import subprocess
import notmuch
-"""
-plan:
-
-1. fetch all the emails - need to call out to notmuch
-2. need to normalize all the inputs?
-3. do kmeans, this is actually rather simple
-
-refs:
-
-- https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py
-
-get the email contents, will return a list of strings:
-
- notmuch show --format=json -- from:ben@bsima.me date:today \
- | jq '..|.content?|select(type=="string")'
-
-"""
-
-
def get_body(e):
+ """Takes an email message object and returns the body content as a single
+string."""
parts = e.get_message_parts()
return ''.join([p.get_payload() for p in parts])
-
def get_emails_for(date):
db = notmuch.Database()
query = db.create_query(f'not tag:deleted and date:{date}')
@@ -47,12 +41,12 @@ def get_emails_for(date):
#contents = {e.get_message_id(): get_body(e) for e in emails}
return [get_body(e) for e in emails]
-DEBUG=True
+DEBUG=os.getenv("DEBUG", True)
-def done(*s):
+def done(*msgs):
if DEBUG==True:
print(f"done in {time()-t0}s")
- if s: [print(s0) for s0 in s]
+ if msgs: [print(msg) for msg in msgs]
print()
def debug(s):
@@ -64,8 +58,8 @@ def debug(s):
debug("loading mail...")
t0 = time()
dataset = get_emails_for("-3d..today")
-# needs to be a "correct" set of tags for the emails, an array of ints where
-# each int corresponds to a notmuch tag
+# TODO: `labels` needs to be the "correct" set of tags for the emails: an array
+# of ints where each int corresponds to a notmuch tag
labels = []
true_k = np.unique(labels).shape[0]
done()