diff options
author | Ben Sima <ben@bsima.me> | 2020-03-06 08:54:27 -0800 |
---|---|---|
committer | Ben Sima <ben@bsima.me> | 2020-03-06 08:54:27 -0800 |
commit | dda68b4bebb5700de6a7dd3c22fbaf8a0de1d211 (patch) | |
tree | 90612cda0992b0195450a05722f918765166f545 /scrape | |
parent | 3264d2d492a8b4a47109cb3620d46df291b44615 (diff) |
refactor/formatting
Diffstat (limited to 'scrape')
-rwxr-xr-x | scrape | 26 |
1 file changed, 16 insertions, 10 deletions
@@ -1,10 +1,15 @@ #!/usr/bin/env python3 +""" +Scrape a webpage for emails or links, or both. +""" + import argparse import re import sys import urllib.request + def get(url): page = urllib.request.urlopen(url) byts = page.read() @@ -12,19 +17,20 @@ def get(url): page.close() return s + def extract_links(content): - links = re.findall('"((http|ftp)s?://.*?)"', content) - return links + return re.findall('"((http|ftp)s?://.*?)"', content) + def extract_emails(content): - emails = re.findall('([\w\.,]+@[\w\.,]+\.\w+)', content) - return emails - -if __name__ == '__main__': - cli = argparse.ArgumentParser(description='Scrape a web page for stuff.') - cli.add_argument('url', metavar='URL', type=str, help='url to fetch') - cli.add_argument('-e', '--emails', action="store_true", help='get emails') - cli.add_argument('-l', '--links', action="store_true", help='get links') + return re.findall("([\w\.,]+@[\w\.,]+\.\w+)", content) + + +if __name__ == "__main__": + cli = argparse.ArgumentParser(description=__doc__) + cli.add_argument("url", metavar="URL", type=str, help="url to fetch") + cli.add_argument("-e", "--emails", action="store_true", help="get emails") + cli.add_argument("-l", "--links", action="store_true", help="get links") args = cli.parse_args() if not (args.emails or args.links): |