# HG changeset patch # User paulo # Date 1590304925 25200 # Node ID cffd95813b82297bf30157cae36613492d70dad9 # Parent 24a967efbf3edd7ae173cb822a206b43b6310d30 add myrss2 diff -r 24a967efbf3e -r cffd95813b82 myrss2/.dockerignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/myrss2/.dockerignore Sun May 24 00:22:05 2020 -0700 @@ -0,0 +1,6 @@ +Dockerfile +README.md +*.pyc +*.pyo +*.pyd +__pycache__ \ No newline at end of file diff -r 24a967efbf3e -r cffd95813b82 myrss2/Dockerfile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/myrss2/Dockerfile Sun May 24 00:22:05 2020 -0700 @@ -0,0 +1,17 @@ +# Use the official lightweight Python image. +# https://hub.docker.com/_/python +FROM python:3.6-slim + +# Copy local code to the container image. +ENV APP_HOME /app +WORKDIR $APP_HOME +COPY . ./ + +# Install production dependencies. +RUN pip install -r requirements.pip + +# Run the web service on container startup. Here we use the gunicorn +# webserver, with one worker process and 8 threads. +# For environments with multiple CPU cores, increase the number of workers +# to be equal to the cores available. 
+CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 myrss_flask:flask_app diff -r 24a967efbf3e -r cffd95813b82 myrss2/FEEDS --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/myrss2/FEEDS Sun May 24 00:22:05 2020 -0700 @@ -0,0 +1,43 @@ +https://news.google.com/news/rss/?gl=US&ned=us&hl=en +https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en +http://ep01.epimg.net/rss/elpais/portada.xml +http://www.sfgate.com/rss/feeds/news_pageone.xml +http://www.sfgate.com/bayarea/feed/Bay-Area-News-429.php +http://www.weatherwest.com/feed +http://sports.yahoo.com/top/rss.xml +http://rss.slashdot.org/Slashdot/slashdot +http://www.theverge.com/rss/index.xml +http://www.reddit.com/.rss +http://feeds.feedburner.com/Metafilter +http://feeds.feedburner.com/AskMetafilter +http://www.vice.com/rss +http://feeds.slate.com/slate +http://feeds.feedburner.com/TheAtlantic +http://aeon.co/magazine/feed/ +http://nautil.us/rss/all +https://api.quantamagazine.org/feed/ +http://undark.org/feed/ +http://priceonomics.com/latest.rss +https://harpers.org/feed/ +http://roadsandkingdoms.com/feed/ +http://thebaffler.com/feed +http://www.edge.org/feed +http://quillette.com/feed/ +http://thebrowser.com/feed +http://longform.org/feed/ +http://longreads.com/rss/ +https://www.project-syndicate.org/rss +http://lifehacker.com/rss +http://bikesnobnyc.blogspot.com/feeds/posts/default +http://feeds.feedburner.com/bikehugger +http://jalopnik.com/rss +https://www.rideapart.com/rss/articles/all/ +http://feeds2.feedburner.com/Bikeexif +http://feeds2.feedburner.com/Cycleexif +http://www.xkcd.com/rss.xml +http://feeds.kottke.org/main +http://feeds.feedburner.com/OpenCulture +http://feeds.feedburner.com/shorpy?q=rss.xml +http://feeds.feedburner.com/codinghorror +https://danielmiessler.com/feed/ +http://syndication.thedailywtf.com/TheDailyWtf diff -r 24a967efbf3e -r cffd95813b82 myrss2/myrss_app.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/myrss2/myrss_app.py Sun May 24 
import io
import os
import sys
import re
import urllib.request
import urllib.error
import threading
import queue
import datetime
import time
import traceback

import logging
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL),
    format="%(asctime)s %(levelname)-8s %(message)s",
)

import xml.etree.ElementTree
import html

# NOTE(review): `from html3.html3 import HTML` used to live here at module
# level; html3 is a third-party package needed only by _to_html(), so it is
# now imported lazily inside that function.  Everything else in this module
# works without it.

FEEDS_FILE = "FEEDS"                # one feed URL per line; '#' lines are comments
CACHE_HTML_FILE = "__cache__.html"  # on-disk cache of the rendered page

CACHE_LIFE = 1200        # cache lifetime [seconds]
MAX_ITEMS = 50           # max items rendered per feed
MAX_LINK_Z = 4           # number of rotating link color classes (z0..z3 in index.css)
MAX_THREADS = 20         # worker threads fetching feeds concurrently
URLOPEN_TIMEOUT = 10     # per-request timeout [seconds]

# Explicit export list; also makes the underscore helpers importable in tests.
__all__ = [
    "_parse_root_tag", "_strip_if_not_none", "_go_rss", "_go_atom",
    "_go_purl_rss", "_strip_html", "_to_html", "_fetch_url",
]


_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")

def _parse_root_tag(root_tag):
    """Split an ElementTree tag '{namespace}local' into (namespace, local).

    Returns (None, local) for tags without a namespace, and (None, None)
    when the tag cannot be parsed at all.
    """
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)
    if re_match is None:
        return (None, None)
    return re_match.group(2, 3)


def _strip_if_not_none(txt):
    """Return txt.strip(), or '' when txt is None (missing XML text node)."""
    return txt.strip() if txt is not None else ''


def _go_rss(elementTree):
    """Extract (title, link, [(item_title, item_link), ...]) from an RSS 2.0 tree.

    Uses findtext() so a feed with a missing <title>/<link> element yields
    None/'' instead of the AttributeError the original find(...).text raised.
    """
    title = _strip_if_not_none(elementTree.findtext("channel/title"))
    link = elementTree.findtext("channel/link")

    items = []
    for item in elementTree.findall("channel/item")[:MAX_ITEMS]:
        items.append((_strip_if_not_none(item.findtext("title")),
                      item.findtext("link")))

    return (title, link, items)


def _go_atom(elementTree):
    """Extract (title, link, [(item_title, item_link), ...]) from an Atom 1.0 tree."""
    ns = "http://www.w3.org/2005/Atom"

    def _pick_link(links):
        # Prefer rel="alternate"; a sole <link> is accepted regardless of rel.
        for l in links:
            if len(links) == 1 or l.get("rel") == "alternate":
                return l.get("href")
        return ''

    title = _strip_if_not_none(elementTree.findtext("{%s}title" % ns))
    link = _pick_link(elementTree.findall("{%s}link" % ns))

    items = []
    for entry in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
        it_title = _strip_if_not_none(entry.findtext("{%s}title" % ns))
        it_link = _pick_link(entry.findall("{%s}link" % ns))
        items.append((it_title, it_link))

    return (title, link, items)


def _go_purl_rss(elementTree):
    """Extract (title, link, [(item_title, item_link), ...]) from an RDF/RSS 1.0 tree."""
    ns = "http://purl.org/rss/1.0/"

    title = _strip_if_not_none(elementTree.findtext("{%s}channel/{%s}title" % (ns, ns)))
    link = elementTree.findtext("{%s}channel/{%s}link" % (ns, ns))

    items = []
    for item in elementTree.findall("{%s}item" % ns)[:MAX_ITEMS]:
        items.append((_strip_if_not_none(item.findtext("{%s}title" % ns)),
                      item.findtext("{%s}link" % ns)))

    return (title, link, items)


_STRIP_HTML_RE = re.compile(r"<.*?>")

def _strip_html(txt):
    """Drop HTML tags from txt and unescape entities, leaving plain text."""
    return html.unescape(_STRIP_HTML_RE.sub('', txt))


def _to_html(dtnow, docstruct):
    """Render the aggregated feeds as a complete HTML page string.

    dtnow     -- timestamp shown in the page title.
    docstruct -- list of (title, link, items) tuples, or None entries for
                 feeds that failed to download/parse (skipped).
    """
    # Third-party dependency, imported lazily: only rendering needs it.
    from html3.html3 import HTML

    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    page_title = "myrss -- %s" % datetime_str

    root = HTML("html")

    header = root.head
    header.meta(name="viewport", content="width=device-width, initial-scale=1")
    header.title(page_title)
    header.link(rel="stylesheet", type="text/css", href="static/index.css")

    body = root.body
    body.h1(page_title)

    link_z = 0  # rotates through the z0..z(MAX_LINK_Z-1) CSS color classes

    for feed in docstruct:
        if feed is None:
            continue  # fetch or parse failed upstream

        (title, link, items) = feed

        logging.debug("title: %s", title)
        body.h2.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        p = body.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            if not it_title:
                it_title = "(missing title)"
            if it_link is not None:
                p.a(_strip_html(it_title), href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
            else:
                p += _strip_html(it_title)

            link_z += 1

    # Page-build time in seconds, shown in a small "debug" footer div.
    dtdelta = datetime.datetime.now() - dtnow
    root.div("%.3f" % (dtdelta.days*86400 + dtdelta.seconds + dtdelta.microseconds/1e6), klass="debug")

    return str(root)


def _fetch_url(url):
    """Download *url* and return its body decoded as UTF-8, or None on failure.

    HTTP errors, DNS failures and timeouts are logged and swallowed here;
    callers treat a None feed as "skip this feed".
    """
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"})
    try:
        logging.info("processing %s", url)
        # Context manager closes the connection; the original leaked the
        # response object.
        with urllib.request.urlopen(request, timeout=URLOPEN_TIMEOUT) as feed:
            return str(feed.read(), encoding="utf-8")
    except OSError as e:
        # HTTPError and URLError are OSError subclasses, as are socket
        # timeouts -- the original caught only HTTPError and let the rest
        # escape to the worker thread.
        logging.info("(%s) %s", url, e)
        return None
__all__ = ["_filter_feed", "_process_feed", "WorkerThread", "main", "MyRssApp"]


def _filter_feed(feed):
    """Strip control characters that break the XML parser (currently SYN, 0x16)."""
    cleaned = feed
    for bad in ("\x16",):
        cleaned = cleaned.replace(bad, "")
    return cleaned


def _process_feed(feed):
    """Parse a feed document string and dispatch on its root element.

    Returns the (title, link, items) tuple produced by the matching _go_*
    handler.  Raises NotImplementedError for unsupported feed dialects;
    XML parse errors propagate to the caller (caught in WorkerThread.run).
    """
    elementTree = xml.etree.ElementTree.parse(io.StringIO(feed))
    root = elementTree.getroot()

    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        # version attribute is a string; float() accepts it directly.
        version = float(root.get("version", "0.0"))
        if version >= 2.0:
            return _go_rss(elementTree)
        raise NotImplementedError("Unsupported rss version")
    if parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
        return _go_atom(elementTree)
    if parsed_root_tag == ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"):
        return _go_purl_rss(elementTree)
    raise NotImplementedError("Unknown root tag")


class WorkerThread(threading.Thread):
    """Daemon worker: pulls (idx, url) jobs, pushes (idx, docfeed) results.

    docfeed is the parsed (title, link, items) tuple, or None when fetching
    or parsing failed -- exactly one output is produced per input, which
    main() relies on to know when all feeds are done.
    """

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        self.daemon = True  # never keep the process alive on shutdown

    def run(self):
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                feed = _fetch_url(url)
                if feed is not None:
                    docfeed = _process_feed(_filter_feed(feed))
            except Exception as e:
                # A bad feed must never kill the worker; log and move on.
                logging.info("(%s) exception: (%s) %s", url, type(e), e)
            self._output_queue.put((idx, docfeed))


def main(input_queue, output_queue, lock):
    """Build (or serve from cache) the aggregated HTML page.

    Serialized by *lock* so only one request at a time rebuilds the page;
    the rendered HTML is cached on disk for CACHE_LIFE seconds.
    """
    ret = ''

    with lock:
        logging.debug("main() started")
        epoch_now = time.time()
        dtnow = datetime.datetime.fromtimestamp(epoch_now)

        if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
            with open(CACHE_HTML_FILE) as cache_html_file:
                ret = cache_html_file.read()
        else:
            with open(FEEDS_FILE) as feeds_file:
                feedlines = feeds_file.readlines()

            # Index-aligned with feedlines so results keep the FEEDS order;
            # skipped lines simply stay None and are ignored by _to_html().
            docstruct = [None]*len(feedlines)
            num_input = 0
            for (i, line) in enumerate(feedlines):
                url = line.strip()
                # Skip comments AND blank lines (the original queued blank
                # lines as empty URLs, wasting a fetch attempt on each).
                if url and not url.startswith('#'):
                    input_queue.put((i, url))
                    num_input += 1

            # Exactly one output arrives per input (see WorkerThread.run).
            for _ in range(num_input):
                (idx, docfeed) = output_queue.get()
                docstruct[idx] = docfeed

            ret = _to_html(dtnow, docstruct)

            with open(CACHE_HTML_FILE, 'w') as cache_html_file:
                cache_html_file.write(ret)
        logging.debug("main() ended")

    return ret


class MyRssApp:
    """Raw WSGI application serving the aggregated feed page."""

    def __init__(self):
        logging.debug("MyRssApp.__init__() called")
        # Unbounded queues: the original Queue(MAX_THREADS) pair could
        # deadlock main() on a feed list longer than ~3*MAX_THREADS entries
        # (inputs, in-flight work and outputs all capped at 20).
        self._iq = queue.Queue()
        self._oq = queue.Queue()
        self._main_lock = threading.Lock()

        for i in range(MAX_THREADS):
            logging.debug("Starting thread: %d", i)
            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()

    def __call__(self, environ, start_response):
        """WSGI entry point; any failure returns the traceback as text/plain 500."""
        response_code = "500 Internal Server Error"
        response_type = "text/plain; charset=UTF-8"

        try:
            response_body = main(self._iq, self._oq, self._main_lock)
            response_code = "200 OK"
            response_type = "text/html; charset=UTF-8"
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # SystemExit/KeyboardInterrupt); traceback becomes the 500 body.
            response_body = traceback.format_exc()

        # Encode first: per WSGI, the body must be bytes and Content-Length
        # counts bytes -- len() of the str undercounts multi-byte UTF-8.
        response_bytes = response_body.encode("utf-8")
        response_headers = [
            ("Content-Type", response_type),
            ("Content-Length", str(len(response_bytes))),
        ]
        start_response(response_code, response_headers)

        return [response_bytes]

    def call(self):
        """Non-WSGI entry point used by the Flask wrapper (myrss_flask.py)."""
        return main(self._iq, self._oq, self._main_lock)
#!/bin/sh
# Package the myrss2 sources into myrss.tar.gz.
# Run from the parent directory of myrss2/.

# NOTE(review): the original FILES list used the "myrss/" prefix, but this
# changeset adds every file under "myrss2/" -- tar would fail on the
# missing paths.  Prefix corrected to match the actual tree.
FILES="
    myrss2/Dockerfile
    myrss2/.dockerignore
    myrss2/FEEDS
    myrss2/myrss_app.py
    myrss2/myrss_flask.py
    myrss2/requirements.pip
    myrss2/static/index.css
"
# FILES is intentionally unquoted: word-splitting expands it into one
# argument per path.
tar -czvf myrss.tar.gz ${FILES}