changeset 108:cffd95813b82

add myrss2
author paulo
date Sun, 24 May 2020 00:22:05 -0700
parents 24a967efbf3e
children a24807036601
files myrss2/.dockerignore myrss2/Dockerfile myrss2/FEEDS myrss2/myrss_app.py myrss2/myrss_flask.py myrss2/requirements.pip myrss2/static/index.css myrss2/tarball.sh
diffstat 8 files changed, 471 insertions(+), 0 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/myrss2/.dockerignore	Sun May 24 00:22:05 2020 -0700
     1.3 @@ -0,0 +1,6 @@
     1.4 +Dockerfile
     1.5 +README.md
     1.6 +*.pyc
     1.7 +*.pyo
     1.8 +*.pyd
     1.9 +__pycache__
    1.10 \ No newline at end of file
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/myrss2/Dockerfile	Sun May 24 00:22:05 2020 -0700
     2.3 @@ -0,0 +1,17 @@
     2.4 +# Use the official lightweight Python image.
     2.5 +# https://hub.docker.com/_/python
     2.6 +FROM python:3.6-slim
     2.7 +
     2.8 +# Copy local code to the container image.
     2.9 +ENV APP_HOME=/app
    2.10 +WORKDIR $APP_HOME
    2.11 +COPY . ./
    2.12 +
    2.13 +# Install production dependencies; skip pip's download cache so it is not stored in the image layer.
    2.14 +RUN pip install --no-cache-dir -r requirements.pip
    2.15 +
    2.16 +# Run the web service on container startup. Here we use the gunicorn
    2.17 +# webserver, with one worker process and 8 threads.
    2.18 +# For environments with multiple CPU cores, increase the number of workers
    2.19 +# to be equal to the cores available. $PORT must be supplied at run time.
    2.20 +CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 myrss_flask:flask_app
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/myrss2/FEEDS	Sun May 24 00:22:05 2020 -0700
     3.3 @@ -0,0 +1,43 @@
     3.4 +https://news.google.com/news/rss/?gl=US&ned=us&hl=en
     3.5 +https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en
     3.6 +http://ep01.epimg.net/rss/elpais/portada.xml
     3.7 +http://www.sfgate.com/rss/feeds/news_pageone.xml
     3.8 +http://www.sfgate.com/bayarea/feed/Bay-Area-News-429.php
     3.9 +http://www.weatherwest.com/feed
    3.10 +http://sports.yahoo.com/top/rss.xml
    3.11 +http://rss.slashdot.org/Slashdot/slashdot
    3.12 +http://www.theverge.com/rss/index.xml
    3.13 +http://www.reddit.com/.rss
    3.14 +http://feeds.feedburner.com/Metafilter
    3.15 +http://feeds.feedburner.com/AskMetafilter
    3.16 +http://www.vice.com/rss
    3.17 +http://feeds.slate.com/slate
    3.18 +http://feeds.feedburner.com/TheAtlantic
    3.19 +http://aeon.co/magazine/feed/
    3.20 +http://nautil.us/rss/all
    3.21 +https://api.quantamagazine.org/feed/
    3.22 +http://undark.org/feed/
    3.23 +http://priceonomics.com/latest.rss
    3.24 +https://harpers.org/feed/
    3.25 +http://roadsandkingdoms.com/feed/
    3.26 +http://thebaffler.com/feed
    3.27 +http://www.edge.org/feed
    3.28 +http://quillette.com/feed/
    3.29 +http://thebrowser.com/feed
    3.30 +http://longform.org/feed/
    3.31 +http://longreads.com/rss/
    3.32 +https://www.project-syndicate.org/rss
    3.33 +http://lifehacker.com/rss
    3.34 +http://bikesnobnyc.blogspot.com/feeds/posts/default
    3.35 +http://feeds.feedburner.com/bikehugger
    3.36 +http://jalopnik.com/rss
    3.37 +https://www.rideapart.com/rss/articles/all/
    3.38 +http://feeds2.feedburner.com/Bikeexif
    3.39 +http://feeds2.feedburner.com/Cycleexif
    3.40 +http://www.xkcd.com/rss.xml
    3.41 +http://feeds.kottke.org/main
    3.42 +http://feeds.feedburner.com/OpenCulture
    3.43 +http://feeds.feedburner.com/shorpy?q=rss.xml
    3.44 +http://feeds.feedburner.com/codinghorror
    3.45 +https://danielmiessler.com/feed/
    3.46 +http://syndication.thedailywtf.com/TheDailyWtf
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/myrss2/myrss_app.py	Sun May 24 00:22:05 2020 -0700
     4.3 @@ -0,0 +1,299 @@
     4.4 +import io
     4.5 +import os
     4.6 +import sys
     4.7 +import re
     4.8 +import urllib.request
     4.9 +import urllib.error
    4.10 +import threading
    4.11 +import queue
    4.12 +import datetime
    4.13 +import time
    4.14 +import traceback
    4.15 +
    4.16 +import logging
    4.17 +LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
    4.18 +logging.basicConfig(
    4.19 +  level=getattr(logging, LOG_LEVEL),
    4.20 +  format="%(asctime)s %(levelname)-8s %(message)s",
    4.21 +)
    4.22 +
    4.23 +import xml.etree.ElementTree 
    4.24 +import html
    4.25 +
    4.26 +from html3.html3 import HTML
    4.27 +
    4.28 +
    4.29 +FEEDS_FILE = "FEEDS"
    4.30 +CACHE_HTML_FILE = "__cache__.html"
    4.31 +
    4.32 +CACHE_LIFE = 1200 # [seconds]
    4.33 +MAX_ITEMS = 50
    4.34 +MAX_LINK_Z = 4
    4.35 +MAX_THREADS = 20
    4.36 +URLOPEN_TIMEOUT = 10 # [seconds]
    4.37 +
    4.38 +
    4.39 +_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")
    4.40 +
    4.41 +def _parse_root_tag(root_tag):
    4.42 +  re_match = _PARSE_ROOT_TAG_RE.match(root_tag)
    4.43 +
    4.44 +  if re_match is None:
    4.45 +    return (None, None)
    4.46 +  else:
    4.47 +    return re_match.group(2, 3)
    4.48 +  
    4.49 +
    4.50 +def _strip_if_not_none(txt):
    4.51 +  return txt.strip() if txt is not None else ''
    4.52 +
    4.53 +
    4.54 +def _go_rss(elementTree):  # RSS 2.0 tree -> (title, link, [(item_title, item_link), ...])
    4.55 +  title = _strip_if_not_none(elementTree.find("channel/title").text)
    4.56 +  link = elementTree.find("channel/link").text
    4.57 +
    4.58 +  items = []
    4.59 +
    4.60 +  for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
    4.61 +    it_title = _strip_if_not_none(i.findtext("title"))  # '' when <title> is absent or empty
    4.62 +    it_link = i.findtext("link") or None  # None when <link> is absent or empty
    4.63 +
    4.64 +    items.append((it_title, it_link))
    4.65 +
    4.66 +  return (title, link, items)
    4.67 +
    4.68 +
    4.69 +def _go_atom(elementTree):
    4.70 +  ns = "http://www.w3.org/2005/Atom"
    4.71 +
    4.72 +  title = _strip_if_not_none(elementTree.find("{%s}title" % ns).text)
    4.73 +  link = ''
    4.74 +
    4.75 +  links = elementTree.findall("{%s}link" % ns)
    4.76 +  for i in links:
    4.77 +    if len(links) == 1 or i.get("rel") == "alternate":
    4.78 +      link = i.get("href")
    4.79 +      break
    4.80 +
    4.81 +  items = []
    4.82 +
    4.83 +  for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
    4.84 +    it_title = _strip_if_not_none(i.find("{%s}title" % ns).text)
    4.85 +    it_link = ''
    4.86 +    
    4.87 +    it_links = i.findall("{%s}link" % ns)
    4.88 +    for j in it_links:
    4.89 +      if len(it_links) == 1 or j.get("rel") == "alternate":
    4.90 +        it_link = j.get("href")
    4.91 +        break
    4.92 +
    4.93 +    items.append((it_title, it_link))
    4.94 +
    4.95 +  return (title, link, items)
    4.96 +
    4.97 +
    4.98 +def _go_purl_rss(elementTree):
    4.99 +  ns = "http://purl.org/rss/1.0/"
   4.100 +
   4.101 +  title = _strip_if_not_none(elementTree.find("{%s}channel/{%s}title" % (ns, ns)).text)
   4.102 +  link = elementTree.find("{%s}channel/{%s}link" % (ns, ns)).text
   4.103 +
   4.104 +  items = []
   4.105 +
   4.106 +  for i in elementTree.findall("{%s}item" % ns)[:MAX_ITEMS]:
   4.107 +    it_title = _strip_if_not_none(i.find("{%s}title" % ns).text)
   4.108 +    it_link = i.find("{%s}link" % ns).text
   4.109 +
   4.110 +    items.append((it_title, it_link))
   4.111 +
   4.112 +  return (title, link, items)
   4.113 +
   4.114 +
   4.115 +_STRIP_HTML_RE = re.compile(r"<.*?>")
   4.116 +
   4.117 +def _strip_html(txt):
   4.118 +  return html.unescape(_STRIP_HTML_RE.sub('', txt))
   4.119 +  
   4.120 +
   4.121 +def _to_html(dtnow, docstruct):
   4.122 +  datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
   4.123 +  page_title = "myrss -- %s" % datetime_str
   4.124 +
   4.125 +  root = HTML("html")
   4.126 +
   4.127 +  header = root.head
   4.128 +  header.meta(name="viewport", content="width=device-width, initial-scale=1")
   4.129 +  header.title(page_title)
   4.130 +  header.link(rel="stylesheet", type="text/css", href="static/index.css")
   4.131 +
   4.132 +  body = root.body
   4.133 +  body.h1(page_title)
   4.134 +
   4.135 +  link_z = 0
   4.136 +
   4.137 +  for feed in docstruct:
   4.138 +    if feed is None:
   4.139 +      continue
   4.140 +
   4.141 +    (title, link, items) = feed
   4.142 +
   4.143 +    logging.debug("title: %s", title)
   4.144 +    body.h2.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z))
   4.145 +    link_z += 1
   4.146 +    p = body.p
   4.147 +
   4.148 +    for (i, (it_title, it_link)) in enumerate(items):
   4.149 +      if i > 0:
   4.150 +        p += " - "
   4.151 +
   4.152 +      if not it_title:
   4.153 +        it_title = "(missing title)"
   4.154 +      if it_link is not None:
   4.155 +        p.a(_strip_html(it_title), href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
   4.156 +      else:
   4.157 +        p += _strip_html(it_title)
   4.158 +        
   4.159 +      link_z += 1
   4.160 +
   4.161 +  dtdelta = datetime.datetime.now() - dtnow
   4.162 +  root.div("%.3f" % (dtdelta.days*86400 + dtdelta.seconds + dtdelta.microseconds/1e6), klass="debug")
   4.163 +
   4.164 +  return str(root)
   4.165 +
   4.166 +
   4.167 +def _fetch_url(url):  # returns the response body decoded as UTF-8, or None on urllib HTTPError
   4.168 +  logging.info("processing %s" % url)
   4.169 +  request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"})
   4.170 +  try:
   4.171 +    with urllib.request.urlopen(request, timeout=URLOPEN_TIMEOUT) as feed:  # close the response when done
   4.172 +      return str(feed.read(), encoding="utf-8")
   4.173 +  except urllib.error.HTTPError as e:
   4.174 +    logging.info("(%s) %s" % (url, e))
   4.175 +    return None
   4.176 +
   4.177 +
   4.178 +def _filter_feed(feed):
   4.179 +  ret = feed
   4.180 +
   4.181 +  filter_out = ["\x16"]
   4.182 +  for i in filter_out:
   4.183 +    ret = ret.replace(i, "")
   4.184 +
   4.185 +  return ret
   4.186 +
   4.187 +
   4.188 +def _process_feed(feed):
   4.189 +  ret = None
   4.190 +
   4.191 +  feed_sio = io.StringIO(feed)
   4.192 +  elementTree = xml.etree.ElementTree.parse(feed_sio)
   4.193 +  root = elementTree.getroot()
   4.194 +
   4.195 +  parsed_root_tag = _parse_root_tag(root.tag) 
   4.196 +
   4.197 +  if parsed_root_tag == (None, "rss"):
   4.198 +    version = float(root.get("version", 0.0))
   4.199 +    if version >= 2.0:
   4.200 +      ret = _go_rss(elementTree)
   4.201 +    else:
   4.202 +      raise NotImplementedError("Unsupported rss version")
   4.203 +  elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
   4.204 +    ret = _go_atom(elementTree)
   4.205 +  elif parsed_root_tag == ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"):
   4.206 +    ret = _go_purl_rss(elementTree)
   4.207 +  else:
   4.208 +    raise NotImplementedError("Unknown root tag")
   4.209 +
   4.210 +  return ret
   4.211 +
   4.212 +
   4.213 +class WorkerThread(threading.Thread):
   4.214 +  def __init__(self, *args, **kwargs):
   4.215 +    self._input_queue = kwargs.pop("input_queue")
   4.216 +    self._output_queue = kwargs.pop("output_queue")
   4.217 +    threading.Thread.__init__(self, *args, **kwargs)
   4.218 +    self.daemon = True
   4.219 +
   4.220 +  def run(self):
   4.221 +    while True:
   4.222 +      (idx, url) = self._input_queue.get()
   4.223 +      docfeed = None
   4.224 +      try:
   4.225 +        feed = _fetch_url(url)
   4.226 +        if feed is not None:
   4.227 +          docfeed = _process_feed(_filter_feed(feed))
   4.228 +      except Exception as e:
   4.229 +        logging.info("(%s) exception: (%s) %s" % (url, type(e), e))
   4.230 +      self._output_queue.put((idx, docfeed))
   4.231 +      
   4.232 +
   4.233 +def main(input_queue, output_queue, lock):
   4.234 +  ret = ''
   4.235 +
   4.236 +  with lock:
   4.237 +    logging.debug("main() started")
   4.238 +    epoch_now = time.time()
   4.239 +    dtnow = datetime.datetime.fromtimestamp(epoch_now)
   4.240 +
   4.241 +    if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
   4.242 +      with open(CACHE_HTML_FILE) as cache_html_file:
   4.243 +        ret = cache_html_file.read()
   4.244 +
   4.245 +    else:
   4.246 +      with open(FEEDS_FILE) as feeds_file:
   4.247 +        feedlines = feeds_file.readlines()
   4.248 +
   4.249 +      docstruct = [None]*len(feedlines)
   4.250 +      num_input = 0
   4.251 +      for (i, l) in enumerate(feedlines):
   4.252 +        if l[0] != '#':
   4.253 +          l = l.strip()
   4.254 +          input_queue.put((i, l))
   4.255 +          num_input += 1
   4.256 +
   4.257 +      for _ in range(num_input):
   4.258 +        (idx, docfeed) = output_queue.get()
   4.259 +        docstruct[idx] = docfeed
   4.260 +
   4.261 +      ret = _to_html(dtnow, docstruct)
   4.262 +
   4.263 +      with open(CACHE_HTML_FILE, 'w') as cache_html_file:
   4.264 +        cache_html_file.write(ret)
   4.265 +    logging.debug("main() ended")
   4.266 +
   4.267 +  return ret
   4.268 +
   4.269 +
   4.270 +class MyRssApp:
   4.271 +  def __init__(self):
   4.272 +    logging.debug("MyRssApp.__init__() called")
   4.273 +    self._iq = queue.Queue(MAX_THREADS)
   4.274 +    self._oq = queue.Queue()  # unbounded: workers must never block on put() while main() is still enqueueing feeds
   4.275 +    self._main_lock = threading.Lock()
   4.276 +
   4.277 +    for i in range(MAX_THREADS):
   4.278 +      logging.debug("Starting thread: %d" % i)
   4.279 +      WorkerThread(input_queue=self._iq, output_queue=self._oq).start()
   4.280 +
   4.281 +  # Raw WSGI entry point; on any failure the traceback is returned as a plain-text 500 response.
   4.282 +  def __call__(self, environ, start_response):
   4.283 +    response_code = "500 Internal Server Error"
   4.284 +    response_type = "text/plain; charset=UTF-8"
   4.285 +
   4.286 +    try:
   4.287 +      response_body = main(self._iq, self._oq, self._main_lock)
   4.288 +      response_code = "200 OK"
   4.289 +      response_type = "text/html; charset=UTF-8"
   4.290 +    except:
   4.291 +      response_body = traceback.format_exc()
   4.292 +    response_bytes = bytes(response_body, encoding="utf-8")  # encode first: Content-Length counts bytes, not characters
   4.293 +    response_headers = [
   4.294 +      ("Content-Type", response_type),
   4.295 +      ("Content-Length", str(len(response_bytes))),
   4.296 +    ]
   4.297 +    start_response(response_code, response_headers)
   4.298 +
   4.299 +    return [response_bytes]
   4.300 +
   4.301 +  def call(self):  # non-WSGI entry point (used by the Flask route); returns the page as a str
   4.302 +    return main(self._iq, self._oq, self._main_lock)
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/myrss2/myrss_flask.py	Sun May 24 00:22:05 2020 -0700
     5.3 @@ -0,0 +1,12 @@
     5.4 +from flask import Flask
     5.5 +
     5.6 +import myrss_app
     5.7 +
     5.8 +
     5.9 +flask_app = Flask(__name__)
    5.10 +my_app = myrss_app.MyRssApp()
    5.11 +
    5.12 +
    5.13 +@flask_app.route("/")
    5.14 +def index():
    5.15 +  return my_app.call()
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/myrss2/requirements.pip	Sun May 24 00:22:05 2020 -0700
     6.3 @@ -0,0 +1,8 @@
     6.4 +click==7.1.2
     6.5 +Flask==1.1.2
     6.6 +gunicorn==20.0.4
     6.7 +html3==1.18
     6.8 +itsdangerous==1.1.0
     6.9 +Jinja2==2.11.2
    6.10 +MarkupSafe==1.1.1
    6.11 +Werkzeug==1.0.1
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/myrss2/static/index.css	Sun May 24 00:22:05 2020 -0700
     7.3 @@ -0,0 +1,74 @@
     7.4 +body
     7.5 +{
     7.6 +	background-color: #111;
     7.7 +	color: #ccc;
     7.8 +}
     7.9 +
    7.10 +a:link
    7.11 +{
    7.12 +	color: #831;
    7.13 +	text-decoration: none;
    7.14 +}
    7.15 +
    7.16 +a:hover
    7.17 +{
    7.18 +	text-decoration: underline;
    7.19 +}
    7.20 +
    7.21 +a:visited
    7.22 +{
    7.23 +	color: gray;
    7.24 +}
    7.25 +
    7.26 +table.index td
    7.27 +{
    7.28 +	padding-right: 15px;
    7.29 +}
    7.30 +
    7.31 +div.index_date
    7.32 +{
    7.33 +	font-size: small;
    7.34 +	color: #666;
    7.35 +}
    7.36 +
    7.37 +div.email
    7.38 +{
    7.39 +	font-size: small;
    7.40 +}
    7.41 +
    7.42 +div.debug
    7.43 +{
    7.44 +	font-family: monospace;
    7.45 +	font-size: x-small;
    7.46 +}
    7.47 +
    7.48 +h1
    7.49 +{
    7.50 +	font-family: sans-serif;
    7.51 +}
    7.52 +
    7.53 +h2
    7.54 +{
    7.55 +	font-family: sans-serif;
    7.56 +}
    7.57 +
    7.58 +a.z1
    7.59 +{
    7.60 +	color: #A31;
    7.61 +}
    7.62 +
    7.63 +a.z2
    7.64 +{
    7.65 +	color: #851;
    7.66 +}
    7.67 +
    7.68 +a.z3
    7.69 +{
    7.70 +	color: #833;
    7.71 +}
    7.72 +
    7.73 +p:first-line
    7.74 +{
    7.75 +	font-weight: bold;
    7.76 +}
    7.77 +
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/myrss2/tarball.sh	Sun May 24 00:22:05 2020 -0700
     8.3 @@ -0,0 +1,12 @@
     8.4 +#!/bin/sh
     8.5 +# Run from the directory that contains myrss2/; the paths below are relative to it.
     8.6 +FILES="
     8.7 +  myrss2/Dockerfile
     8.8 +  myrss2/.dockerignore
     8.9 +  myrss2/FEEDS
    8.10 +  myrss2/myrss_app.py
    8.11 +  myrss2/myrss_flask.py
    8.12 +  myrss2/requirements.pip
    8.13 +  myrss2/static/index.css
    8.14 +"
    8.15 +tar -czvf myrss.tar.gz ${FILES}