Mercurial > hg > index.fcgi > www > www-1
changeset 110:1a5c0fc5627a
myrss2: fix gzip and leading whitespace handling; add test feed, test server; update FEEDS
author | paulo |
---|---|
date | Wed, 03 Jun 2020 01:20:20 -0700 |
parents | a24807036601 |
children | 320904a2e311 |
files | myrss2/FEEDS myrss2/myrss_app.py myrss2/myrss_test_feed.py myrss2/myrss_test_server.py |
diffstat | 4 files changed, 37 insertions(+), 9 deletions(-) [+] |
line diff
1.1 --- a/myrss2/FEEDS Sun May 24 00:22:32 2020 -0700
1.2 +++ b/myrss2/FEEDS Wed Jun 03 01:20:20 2020 -0700
1.3 @@ -1,6 +1,7 @@
1.4  https://news.google.com/news/rss/?gl=US&ned=us&hl=en
1.5  https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en
1.6 -http://ep01.epimg.net/rss/elpais/portada.xml
1.7 +https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada
1.8 +https://ww2.kqed.org/news/feed/
1.9  http://www.sfgate.com/rss/feeds/news_pageone.xml
1.10  http://www.sfgate.com/bayarea/feed/Bay-Area-News-429.php
1.11  http://www.weatherwest.com/feed
2.1 --- a/myrss2/myrss_app.py Sun May 24 00:22:32 2020 -0700
2.2 +++ b/myrss2/myrss_app.py Wed Jun 03 01:20:20 2020 -0700
2.3 @@ -1,14 +1,15 @@
2.4 +import datetime
2.5 +import gzip
2.6  import io
2.7  import os
2.8 +import queue
2.9 +import re
2.10  import sys
2.11 -import re
2.12 -import urllib.request
2.13 -import urllib.error
2.14  import threading
2.15 -import queue
2.16 -import datetime
2.17  import time
2.18  import traceback
2.19 +import urllib.error
2.20 +import urllib.request
2.21
2.22  import logging
2.23  LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
2.24 @@ -165,15 +166,21 @@
2.25      try:
2.26          logging.info("processing %s" % url)
2.27          feed = urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"}), timeout=URLOPEN_TIMEOUT)
2.28 +        response_headers = feed.info().as_string().splitlines()
2.29 +        if 'Content-Encoding: gzip' in response_headers:
2.30 +            body = gzip.decompress(feed.read())
2.31 +        else:
2.32 +            body = feed.read()
2.33 +
2.34      except urllib.error.HTTPError as e:
2.35          logging.info("(%s) %s" % (url, e))
2.36          return None
2.37
2.38 -    return str(feed.read(), encoding="utf-8")
2.39 +    return str(body, encoding="utf-8")
2.40
2.41
2.42  def _filter_feed(feed):
2.43 -    ret = feed
2.44 +    ret = feed.strip()
2.45
2.46      filter_out = ["\x16"]
2.47      for i in filter_out:
2.48 @@ -275,7 +282,7 @@
2.49              logging.debug("Starting thread: %d" % i)
2.50              WorkerThread(input_queue=self._iq, output_queue=self._oq).start()
2.51
2.52 -    # Raw WSGI
2.53 +    # Raw WSGI
2.54      def __call__(self, environ, start_response):
2.55          response_code = "500 Internal Server Error"
2.56          response_type = "text/plain; charset=UTF-8"
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/myrss2/myrss_test_feed.py Wed Jun 03 01:20:20 2020 -0700
3.3 @@ -0,0 +1,11 @@
3.4 +import datetime
3.5 +
3.6 +import myrss_app
3.7 +
3.8 +
3.9 +feed = open("testfeed.xml").read()
3.10 +filtered_feed = myrss_app._filter_feed(feed)
3.11 +x = myrss_app._process_feed(filtered_feed)
3.12 +y = myrss_app._to_html(datetime.datetime.now(), [x])
3.13 +
3.14 +print(y)
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/myrss2/myrss_test_server.py Wed Jun 03 01:20:20 2020 -0700
4.3 @@ -0,0 +1,9 @@
4.4 +import wsgiref.simple_server
4.5 +import socketserver
4.6 +
4.7 +import myrss_app
4.8 +
4.9 +
4.10 +if __name__ == "__main__":
4.11 +    with wsgiref.simple_server.make_server('', 8000, myrss_app.MyRssApp()) as httpd:
4.12 +        httpd.serve_forever()