diff myrss2/myrss_app.py @ 110:1a5c0fc5627a

myrss2: fix gzip and leading whitespace handling; add test feed, test server; update FEEDS
author paulo
date Wed, 03 Jun 2020 01:20:20 -0700
parents cffd95813b82
children 2ed8cf5f36bf
line diff
     1.1 --- a/myrss2/myrss_app.py	Sun May 24 00:22:32 2020 -0700
     1.2 +++ b/myrss2/myrss_app.py	Wed Jun 03 01:20:20 2020 -0700
     1.3 @@ -1,14 +1,15 @@
     1.4 +import datetime
     1.5 +import gzip
     1.6  import io
     1.7  import os
     1.8 +import queue
     1.9 +import re
    1.10  import sys
    1.11 -import re
    1.12 -import urllib.request
    1.13 -import urllib.error
    1.14  import threading
    1.15 -import queue
    1.16 -import datetime
    1.17  import time
    1.18  import traceback
    1.19 +import urllib.error
    1.20 +import urllib.request
    1.21  
    1.22  import logging
    1.23  LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
    1.24 @@ -165,15 +166,21 @@
    1.25    try:
    1.26      logging.info("processing %s" % url)
    1.27      feed = urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"}), timeout=URLOPEN_TIMEOUT)
    1.28 +    response_headers = feed.info().as_string().splitlines()
    1.29 +    if 'Content-Encoding: gzip' in response_headers:
    1.30 +      body = gzip.decompress(feed.read())
    1.31 +    else:
    1.32 +      body = feed.read()
    1.33 +
    1.34    except urllib.error.HTTPError as e:
    1.35      logging.info("(%s) %s" % (url, e))
    1.36      return None
    1.37  
    1.38 -  return str(feed.read(), encoding="utf-8")
    1.39 +  return str(body, encoding="utf-8")
    1.40  
    1.41  
    1.42  def _filter_feed(feed):
    1.43 -  ret = feed
    1.44 +  ret = feed.strip()
    1.45  
    1.46    filter_out = ["\x16"]
    1.47    for i in filter_out:
    1.48 @@ -275,7 +282,7 @@
    1.49        logging.debug("Starting thread: %d" % i)
    1.50        WorkerThread(input_queue=self._iq, output_queue=self._oq).start()
    1.51     
    1.52 -   # Raw WSGI
    1.53 +  # Raw WSGI
    1.54    def __call__(self, environ, start_response):
    1.55      response_code = "500 Internal Server Error"
    1.56      response_type = "text/plain; charset=UTF-8"