changeset 110:1a5c0fc5627a

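myrss2: decompress gzip-encoded feed responses and strip surrounding whitespace from feeds before filtering; add a test-feed script and a local WSGI test server; update FEEDS (new El País URL, add KQED)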
author paulo
date Wed, 03 Jun 2020 01:20:20 -0700
parents a24807036601
children 320904a2e311
files myrss2/FEEDS myrss2/myrss_app.py myrss2/myrss_test_feed.py myrss2/myrss_test_server.py
diffstat 4 files changed, 37 insertions(+), 9 deletions(-) [+]
line diff
     1.1 --- a/myrss2/FEEDS	Sun May 24 00:22:32 2020 -0700
     1.2 +++ b/myrss2/FEEDS	Wed Jun 03 01:20:20 2020 -0700
     1.3 @@ -1,6 +1,7 @@
     1.4  https://news.google.com/news/rss/?gl=US&ned=us&hl=en
     1.5  https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en
     1.6 -http://ep01.epimg.net/rss/elpais/portada.xml
     1.7 +https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada
     1.8 +https://ww2.kqed.org/news/feed/
     1.9  http://www.sfgate.com/rss/feeds/news_pageone.xml
    1.10  http://www.sfgate.com/bayarea/feed/Bay-Area-News-429.php
    1.11  http://www.weatherwest.com/feed
     2.1 --- a/myrss2/myrss_app.py	Sun May 24 00:22:32 2020 -0700
     2.2 +++ b/myrss2/myrss_app.py	Wed Jun 03 01:20:20 2020 -0700
     2.3 @@ -1,14 +1,15 @@
     2.4 +import datetime
     2.5 +import gzip
     2.6  import io
     2.7  import os
     2.8 +import queue
     2.9 +import re
    2.10  import sys
    2.11 -import re
    2.12 -import urllib.request
    2.13 -import urllib.error
    2.14  import threading
    2.15 -import queue
    2.16 -import datetime
    2.17  import time
    2.18  import traceback
    2.19 +import urllib.error
    2.20 +import urllib.request
    2.21  
    2.22  import logging
    2.23  LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
    2.24 @@ -165,15 +166,21 @@
    2.25    try:
    2.26      logging.info("processing %s" % url)
    2.27      feed = urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"}), timeout=URLOPEN_TIMEOUT)
    2.28 +    response_headers = feed.info().as_string().splitlines()
    2.29 +    if 'Content-Encoding: gzip' in response_headers:
    2.30 +      body = gzip.decompress(feed.read())
    2.31 +    else:
    2.32 +      body = feed.read()
    2.33 +
    2.34    except urllib.error.HTTPError as e:
    2.35      logging.info("(%s) %s" % (url, e))
    2.36      return None
    2.37  
    2.38 -  return str(feed.read(), encoding="utf-8")
    2.39 +  return str(body, encoding="utf-8")
    2.40  
    2.41  
    2.42  def _filter_feed(feed):
    2.43 -  ret = feed
    2.44 +  ret = feed.strip()
    2.45  
    2.46    filter_out = ["\x16"]
    2.47    for i in filter_out:
    2.48 @@ -275,7 +282,7 @@
    2.49        logging.debug("Starting thread: %d" % i)
    2.50        WorkerThread(input_queue=self._iq, output_queue=self._oq).start()
    2.51     
    2.52 -   # Raw WSGI
    2.53 +  # Raw WSGI
    2.54    def __call__(self, environ, start_response):
    2.55      response_code = "500 Internal Server Error"
    2.56      response_type = "text/plain; charset=UTF-8"
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/myrss2/myrss_test_feed.py	Wed Jun 03 01:20:20 2020 -0700
     3.3 @@ -0,0 +1,11 @@
     3.4 +import datetime
     3.5 +
     3.6 +import myrss_app
     3.7 +
     3.8 +
     3.9 +feed = open("testfeed.xml").read()
    3.10 +filtered_feed = myrss_app._filter_feed(feed)
    3.11 +x = myrss_app._process_feed(filtered_feed)
    3.12 +y = myrss_app._to_html(datetime.datetime.now(), [x])
    3.13 +
    3.14 +print(y)
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/myrss2/myrss_test_server.py	Wed Jun 03 01:20:20 2020 -0700
     4.3 @@ -0,0 +1,9 @@
     4.4 +import wsgiref.simple_server 
     4.5 +import socketserver
     4.6 +
     4.7 +import myrss_app
     4.8 +
     4.9 +
    4.10 +if __name__ == "__main__":
    4.11 +	with wsgiref.simple_server.make_server('', 8000, myrss_app.MyRssApp()) as httpd:
    4.12 +		httpd.serve_forever()