diff myrss2/myrss_app.py @ 108:cffd95813b82

add myrss2
author paulo
date Sun, 24 May 2020 00:22:05 -0700
parents
children 1a5c0fc5627a
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/myrss2/myrss_app.py	Sun May 24 00:22:05 2020 -0700
     1.3 @@ -0,0 +1,299 @@
     1.4 +import io
     1.5 +import os
     1.6 +import sys
     1.7 +import re
     1.8 +import urllib.request
     1.9 +import urllib.error
    1.10 +import threading
    1.11 +import queue
    1.12 +import datetime
    1.13 +import time
    1.14 +import traceback
    1.15 +
    1.16 +import logging
# Log level is taken from the LOG_LEVEL environment variable
# (e.g. DEBUG, INFO, WARNING); defaults to INFO.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
logging.basicConfig(
  level=getattr(logging, LOG_LEVEL),
  format="%(asctime)s %(levelname)-8s %(message)s",
)
    1.22 +
    1.23 +import xml.etree.ElementTree 
    1.24 +import html
    1.25 +
    1.26 +from html3.html3 import HTML
    1.27 +
    1.28 +
FEEDS_FILE = "FEEDS"                # input: one feed URL per line; '#' comments a line out
CACHE_HTML_FILE = "__cache__.html"  # rendered page cache, reused while fresh

CACHE_LIFE = 1200       # cache lifetime [seconds]
MAX_ITEMS = 50          # max items rendered per feed
MAX_LINK_Z = 4          # number of CSS link classes cycled through (z0..z3)
MAX_THREADS = 20        # worker thread pool size (also bounds the queues)
URLOPEN_TIMEOUT = 10    # per-feed fetch timeout [seconds]
    1.37 +
    1.38 +
    1.39 +_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")
    1.40 +
    1.41 +def _parse_root_tag(root_tag):
    1.42 +  re_match = _PARSE_ROOT_TAG_RE.match(root_tag)
    1.43 +
    1.44 +  if re_match is None:
    1.45 +    return (None, None)
    1.46 +  else:
    1.47 +    return re_match.group(2, 3)
    1.48 +  
    1.49 +
    1.50 +def _strip_if_not_none(txt):
    1.51 +  return txt.strip() if txt is not None else ''
    1.52 +
    1.53 +
    1.54 +def _go_rss(elementTree):
    1.55 +  title = _strip_if_not_none(elementTree.find("channel/title").text)
    1.56 +  link = elementTree.find("channel/link").text
    1.57 +
    1.58 +  items = []
    1.59 +
    1.60 +  for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
    1.61 +    it_title = _strip_if_not_none(i.find("title").text)
    1.62 +    it_link = i.find("link").text
    1.63 +
    1.64 +    items.append((it_title, it_link))
    1.65 +
    1.66 +  return (title, link, items)
    1.67 +
    1.68 +
    1.69 +def _go_atom(elementTree):
    1.70 +  ns = "http://www.w3.org/2005/Atom"
    1.71 +
    1.72 +  title = _strip_if_not_none(elementTree.find("{%s}title" % ns).text)
    1.73 +  link = ''
    1.74 +
    1.75 +  links = elementTree.findall("{%s}link" % ns)
    1.76 +  for i in links:
    1.77 +    if len(links) == 1 or i.get("rel") == "alternate":
    1.78 +      link = i.get("href")
    1.79 +      break
    1.80 +
    1.81 +  items = []
    1.82 +
    1.83 +  for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
    1.84 +    it_title = _strip_if_not_none(i.find("{%s}title" % ns).text)
    1.85 +    it_link = ''
    1.86 +    
    1.87 +    it_links = i.findall("{%s}link" % ns)
    1.88 +    for j in it_links:
    1.89 +      if len(it_links) == 1 or j.get("rel") == "alternate":
    1.90 +        it_link = j.get("href")
    1.91 +        break
    1.92 +
    1.93 +    items.append((it_title, it_link))
    1.94 +
    1.95 +  return (title, link, items)
    1.96 +
    1.97 +
    1.98 +def _go_purl_rss(elementTree):
    1.99 +  ns = "http://purl.org/rss/1.0/"
   1.100 +
   1.101 +  title = _strip_if_not_none(elementTree.find("{%s}channel/{%s}title" % (ns, ns)).text)
   1.102 +  link = elementTree.find("{%s}channel/{%s}link" % (ns, ns)).text
   1.103 +
   1.104 +  items = []
   1.105 +
   1.106 +  for i in elementTree.findall("{%s}item" % ns)[:MAX_ITEMS]:
   1.107 +    it_title = _strip_if_not_none(i.find("{%s}title" % ns).text)
   1.108 +    it_link = i.find("{%s}link" % ns).text
   1.109 +
   1.110 +    items.append((it_title, it_link))
   1.111 +
   1.112 +  return (title, link, items)
   1.113 +
   1.114 +
   1.115 +_STRIP_HTML_RE = re.compile(r"<.*?>")
   1.116 +
   1.117 +def _strip_html(txt):
   1.118 +  return html.unescape(_STRIP_HTML_RE.sub('', txt))
   1.119 +  
   1.120 +
def _to_html(dtnow, docstruct):
  """Render the aggregated feed list as a complete HTML page.

  dtnow: datetime stamped into the page title and used to measure
  rendering time (shown in the footer div).
  docstruct: list of (title, link, items) tuples, or None entries for
  feeds that failed (skipped).
  Returns the page as a str.
  """
  datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
  page_title = "myrss -- %s" % datetime_str

  root = HTML("html")

  header = root.head
  header.meta(name="viewport", content="width=device-width, initial-scale=1")
  header.title(page_title)
  header.link(rel="stylesheet", type="text/css", href="static/index.css")

  body = root.body
  body.h1(page_title)

  # link_z counts every emitted link; modulo MAX_LINK_Z it cycles the
  # CSS classes z0..z{MAX_LINK_Z-1}.
  link_z = 0

  for feed in docstruct:
    if feed is None:
      # Feed failed to download or parse; leave it off the page.
      continue

    (title, link, items) = feed

    logging.debug("title: %s", title)
    body.h2.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z))
    link_z += 1
    p = body.p

    for (i, (it_title, it_link)) in enumerate(items):
      if i > 0:
        p += " - "

      if not it_title:
        it_title = "(missing title)"
      if it_link is not None:
        p.a(_strip_html(it_title), href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
      else:
        # No link available: emit plain text instead of an anchor.
        p += _strip_html(it_title)

      link_z += 1

  # Footer: seconds spent rendering, for debugging.
  dtdelta = datetime.datetime.now() - dtnow
  root.div("%.3f" % (dtdelta.days*86400 + dtdelta.seconds + dtdelta.microseconds/1e6), klass="debug")

  return str(root)
   1.165 +
   1.166 +
   1.167 +def _fetch_url(url):
   1.168 +  try:
   1.169 +    logging.info("processing %s" % url)
   1.170 +    feed = urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"}), timeout=URLOPEN_TIMEOUT)
   1.171 +  except urllib.error.HTTPError as e:
   1.172 +    logging.info("(%s) %s" % (url, e))
   1.173 +    return None
   1.174 +
   1.175 +  return str(feed.read(), encoding="utf-8")
   1.176 +
   1.177 +
   1.178 +def _filter_feed(feed):
   1.179 +  ret = feed
   1.180 +
   1.181 +  filter_out = ["\x16"]
   1.182 +  for i in filter_out:
   1.183 +    ret = ret.replace(i, "")
   1.184 +
   1.185 +  return ret
   1.186 +
   1.187 +
   1.188 +def _process_feed(feed):
   1.189 +  ret = None
   1.190 +
   1.191 +  feed_sio = io.StringIO(feed)
   1.192 +  elementTree = xml.etree.ElementTree.parse(feed_sio)
   1.193 +  root = elementTree.getroot()
   1.194 +
   1.195 +  parsed_root_tag = _parse_root_tag(root.tag) 
   1.196 +
   1.197 +  if parsed_root_tag == (None, "rss"):
   1.198 +    version = float(root.get("version", 0.0))
   1.199 +    if version >= 2.0:
   1.200 +      ret = _go_rss(elementTree)
   1.201 +    else:
   1.202 +      raise NotImplementedError("Unsupported rss version")
   1.203 +  elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
   1.204 +    ret = _go_atom(elementTree)
   1.205 +  elif parsed_root_tag == ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"):
   1.206 +    ret = _go_purl_rss(elementTree)
   1.207 +  else:
   1.208 +    raise NotImplementedError("Unknown root tag")
   1.209 +
   1.210 +  return ret
   1.211 +
   1.212 +
   1.213 +class WorkerThread(threading.Thread):
   1.214 +  def __init__(self, *args, **kwargs):
   1.215 +    self._input_queue = kwargs.pop("input_queue")
   1.216 +    self._output_queue = kwargs.pop("output_queue")
   1.217 +    threading.Thread.__init__(self, *args, **kwargs)
   1.218 +    self.daemon = True
   1.219 +
   1.220 +  def run(self):
   1.221 +    while True:
   1.222 +      (idx, url) = self._input_queue.get()
   1.223 +      docfeed = None
   1.224 +      try:
   1.225 +        feed = _fetch_url(url)
   1.226 +        if feed is not None:
   1.227 +          docfeed = _process_feed(_filter_feed(feed))
   1.228 +      except Exception as e:
   1.229 +        logging.info("(%s) exception: (%s) %s" % (url, type(e), e))
   1.230 +      self._output_queue.put((idx, docfeed))
   1.231 +      
   1.232 +
def main(input_queue, output_queue, lock):
  """Build (or reuse) the aggregated HTML page and return it as a str.

  input_queue/output_queue connect to the WorkerThread pool; lock
  serializes whole page builds so concurrent requests don't rebuild
  the page (or the cache file) simultaneously.
  """
  ret = ''

  with lock:
    logging.debug("main() started")
    epoch_now = time.time()
    dtnow = datetime.datetime.fromtimestamp(epoch_now)

    # Serve the cached page while it is younger than CACHE_LIFE seconds.
    if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
      with open(CACHE_HTML_FILE) as cache_html_file:
        ret = cache_html_file.read()

    else:
      with open(FEEDS_FILE) as feeds_file:
        feedlines = feeds_file.readlines()

      # One slot per feed line; workers fill slots by line index, so the
      # rendered order matches the FEEDS file regardless of completion order.
      docstruct = [None]*len(feedlines)
      num_input = 0
      for (i, l) in enumerate(feedlines):
        if l[0] != '#':  # a leading '#' comments the feed out
          l = l.strip()
          input_queue.put((i, l))
          num_input += 1

      # Collect exactly one result per submitted URL (failures arrive as None).
      for _ in range(num_input):
        (idx, docfeed) = output_queue.get()
        docstruct[idx] = docfeed

      ret = _to_html(dtnow, docstruct)

      # Refresh the cache for subsequent requests.
      with open(CACHE_HTML_FILE, 'w') as cache_html_file:
        cache_html_file.write(ret)
    logging.debug("main() ended")

  return ret
   1.268 +
   1.269 +
   1.270 +class MyRssApp:
   1.271 +  def __init__(self):
   1.272 +    logging.debug("MyRssApp.__init__() called")
   1.273 +    self._iq = queue.Queue(MAX_THREADS)
   1.274 +    self._oq = queue.Queue(MAX_THREADS)
   1.275 +    self._main_lock = threading.Lock()
   1.276 +
   1.277 +    for i in range(MAX_THREADS):
   1.278 +      logging.debug("Starting thread: %d" % i)
   1.279 +      WorkerThread(input_queue=self._iq, output_queue=self._oq).start()
   1.280 +   
   1.281 +   # Raw WSGI
   1.282 +  def __call__(self, environ, start_response):
   1.283 +    response_code = "500 Internal Server Error"
   1.284 +    response_type = "text/plain; charset=UTF-8"
   1.285 +
   1.286 +    try:
   1.287 +      response_body = main(self._iq, self._oq, self._main_lock)
   1.288 +      response_code = "200 OK"
   1.289 +      response_type = "text/html; charset=UTF-8"
   1.290 +    except:
   1.291 +      response_body = traceback.format_exc()
   1.292 +
   1.293 +    response_headers = [
   1.294 +      ("Content-Type", response_type),
   1.295 +      ("Content-Length", str(len(response_body))),
   1.296 +    ]
   1.297 +    start_response(response_code, response_headers)
   1.298 +
   1.299 +    return [bytes(response_body, encoding="utf-8")]
   1.300 +
   1.301 +  def call(self):
   1.302 +    return main(self._iq, self._oq, self._main_lock)