notmuch-autotag: some updates with docs

author: Ben Sima <ben@bsima.me> 2019-05-13 13:50:08 -0700
committer: Ben Sima <ben@bsima.me> 2019-05-13 13:51:06 -0700
commit: fc7dfe1ad145ab57e59890ec1351d9a05f1860ec (patch)
tree: f52da6dc823e6809e31952c547975255cf34a5e4 /notmuch-k
parent: 78971e60fa76e7717384df9965a8232d39a76501 (diff)
1 files changed, 19 insertions, 25 deletions
diff --git a/notmuch-k b/notmuch-k
index 8c1dd8c..04fe290 100755
--- a/notmuch-k
+++ b/notmuch-k
@@ -1,5 +1,17 @@
 #!/usr/bin/env nix-shell
 #! nix-shell -i python3 -p "python37.withPackages(ps:[ps.notmuch ps.scikitlearn])
+"""
+plan:
+
+1. fetch all the emails - uses notmuch python library
+2. need to normalize all the inputs?
+3. train a naive bayes classifier and classify emails
+4. apply inferred tags
+
+refs:
+
+- https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py
+"""
 
 from sklearn import metrics
 from sklearn.cluster import KMeans, MiniBatchKMeans
@@ -16,30 +28,12 @@ import sys
 import subprocess
 import notmuch
 
-"""
-plan:
-
-1. fetch all the emails - need to call out to notmuch
-2. need to normalize all the inputs?
-3. do kmeans, this is actually rather simple
-
-refs:
-
-- https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py
-
-get the email contents, will return a list of strings:
-
-   notmuch show --format=json -- from:ben@bsima.me date:today \
-     | jq '..|.content?|select(type=="string")'
-
-"""
-
-
 def get_body(e):
+    """Takes an email message object and returns the body content as a single
+string."""
     parts = e.get_message_parts()
     return ''.join([p.get_payload() for p in parts])
 
-
 def get_emails_for(date):
     db = notmuch.Database()
     query = db.create_query(f'not tag:deleted and date:{date}')
@@ -47,12 +41,12 @@ def get_emails_for(date):
     #contents = {e.get_message_id(): get_body(e) for e in emails}
     return [get_body(e) for e in emails]
 
-DEBUG=True
+DEBUG=os.getenv("DEBUG", True)
 
-def done(*s):
+def done(*msgs):
     if DEBUG==True:
         print(f"done in {time()-t0}s")
-        if s: [print(s0) for s0 in s]
+        if msgs: [print(msg) for msg in msgs]
         print()
 
 def debug(s):
@@ -64,8 +58,8 @@ def debug(s):
 debug("loading mail...")
 t0 = time()
 dataset = get_emails_for("-3d..today")
-# needs to be a "correct" set of tags for the emails, an array of ints where
-# each int corresponds to a notmuch tag
+# TODO: needs to be a "correct" set of tags for the emails, an array of ints
+# where each int corresponds to a notmuch tag
 labels = []
 true_k = np.unique(labels).shape[0]
 done()
author	Ben Sima <ben@bsima.me>	2019-05-13 13:50:08 -0700
committer	Ben Sima <ben@bsima.me>	2019-05-13 13:51:06 -0700
commit	fc7dfe1ad145ab57e59890ec1351d9a05f1860ec (patch)
tree	f52da6dc823e6809e31952c547975255cf34a5e4 /notmuch-k
parent	78971e60fa76e7717384df9965a8232d39a76501 (diff)