# HG changeset patch # User paulo # Date 1591172420 25200 # Node ID 1a5c0fc5627a9c2e32e080b16856e313bec39cb4 # Parent a24807036601b8514a1ba6ae2e713604c4dc55c8 myrss2: fix gzip and leading whitespace handling; add test feed, test server; update FEEDS diff -r a24807036601 -r 1a5c0fc5627a myrss2/FEEDS --- a/myrss2/FEEDS Sun May 24 00:22:32 2020 -0700 +++ b/myrss2/FEEDS Wed Jun 03 01:20:20 2020 -0700 @@ -1,6 +1,7 @@ https://news.google.com/news/rss/?gl=US&ned=us&hl=en https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en -http://ep01.epimg.net/rss/elpais/portada.xml +https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada +https://ww2.kqed.org/news/feed/ http://www.sfgate.com/rss/feeds/news_pageone.xml http://www.sfgate.com/bayarea/feed/Bay-Area-News-429.php http://www.weatherwest.com/feed diff -r a24807036601 -r 1a5c0fc5627a myrss2/myrss_app.py --- a/myrss2/myrss_app.py Sun May 24 00:22:32 2020 -0700 +++ b/myrss2/myrss_app.py Wed Jun 03 01:20:20 2020 -0700 @@ -1,14 +1,15 @@ +import datetime +import gzip import io import os +import queue +import re import sys -import re -import urllib.request -import urllib.error import threading -import queue -import datetime import time import traceback +import urllib.error +import urllib.request import logging LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO") @@ -165,15 +166,21 @@ try: logging.info("processing %s" % url) feed = urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"}), timeout=URLOPEN_TIMEOUT) + response_headers = feed.info().as_string().splitlines() + if 'Content-Encoding: gzip' in response_headers: + body = gzip.decompress(feed.read()) + else: + body = feed.read() + except urllib.error.HTTPError as e: logging.info("(%s) %s" % (url, e)) return None - return str(feed.read(), encoding="utf-8") + return str(body, encoding="utf-8") def _filter_feed(feed): - ret = feed + ret = feed.strip() filter_out = ["\x16"] for i in filter_out: @@ -275,7 +282,7 @@ logging.debug("Starting thread: %d" % i) WorkerThread(input_queue=self._iq, output_queue=self._oq).start() - # Raw WSGI + # Raw WSGI def __call__(self, environ, start_response): response_code = "500 Internal Server Error" response_type = "text/plain; charset=UTF-8" diff -r a24807036601 -r 1a5c0fc5627a myrss2/myrss_test_feed.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/myrss2/myrss_test_feed.py Wed Jun 03 01:20:20 2020 -0700 @@ -0,0 +1,11 @@ +import datetime + +import myrss_app + + +feed = open("testfeed.xml").read() +filtered_feed = myrss_app._filter_feed(feed) +x = myrss_app._process_feed(filtered_feed) +y = myrss_app._to_html(datetime.datetime.now(), [x]) + +print(y) diff -r a24807036601 -r 1a5c0fc5627a myrss2/myrss_test_server.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/myrss2/myrss_test_server.py Wed Jun 03 01:20:20 2020 -0700 @@ -0,0 +1,9 @@ +import wsgiref.simple_server +import socketserver + +import myrss_app + + +if __name__ == "__main__": + with wsgiref.simple_server.make_server('', 8000, myrss_app.MyRssApp()) as httpd: + httpd.serve_forever()