From dda68b4bebb5700de6a7dd3c22fbaf8a0de1d211 Mon Sep 17 00:00:00 2001 From: Ben Sima Date: Fri, 6 Mar 2020 08:54:27 -0800 Subject: refactor/formatting --- scrape | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'scrape') diff --git a/scrape b/scrape index 7da2125..fde6b2b 100755 --- a/scrape +++ b/scrape @@ -1,10 +1,15 @@ #!/usr/bin/env python3 +""" +Scrape a webpage for emails or links, or both. +""" + import argparse import re import sys import urllib.request + def get(url): page = urllib.request.urlopen(url) byts = page.read() @@ -12,19 +17,20 @@ def get(url): page.close() return s + def extract_links(content): - links = re.findall('"((http|ftp)s?://.*?)"', content) - return links + return re.findall('"((http|ftp)s?://.*?)"', content) + def extract_emails(content): - emails = re.findall('([\w\.,]+@[\w\.,]+\.\w+)', content) - return emails - -if __name__ == '__main__': - cli = argparse.ArgumentParser(description='Scrape a web page for stuff.') - cli.add_argument('url', metavar='URL', type=str, help='url to fetch') - cli.add_argument('-e', '--emails', action="store_true", help='get emails') - cli.add_argument('-l', '--links', action="store_true", help='get links') + return re.findall("([\w\.,]+@[\w\.,]+\.\w+)", content) + + +if __name__ == "__main__": + cli = argparse.ArgumentParser(description=__doc__) + cli.add_argument("url", metavar="URL", type=str, help="url to fetch") + cli.add_argument("-e", "--emails", action="store_true", help="get emails") + cli.add_argument("-l", "--links", action="store_true", help="get links") args = cli.parse_args() if not (args.emails or args.links): -- cgit v1.2.3