Mercurial > hg > index.fcgi > www > www-1
changeset 108:cffd95813b82
add myrss2
author | paulo |
---|---|
date | Sun, 24 May 2020 00:22:05 -0700 |
parents | 24a967efbf3e |
children | a24807036601 |
files | myrss2/.dockerignore myrss2/Dockerfile myrss2/FEEDS myrss2/myrss_app.py myrss2/myrss_flask.py myrss2/requirements.pip myrss2/static/index.css myrss2/tarball.sh |
diffstat | 8 files changed, 471 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/myrss2/.dockerignore Sun May 24 00:22:05 2020 -0700 1.3 @@ -0,0 +1,6 @@ 1.4 +Dockerfile 1.5 +README.md 1.6 +*.pyc 1.7 +*.pyo 1.8 +*.pyd 1.9 +__pycache__ 1.10 \ No newline at end of file
# myrss2/Dockerfile
#
# Use the official lightweight Python image.
# https://hub.docker.com/_/python
FROM python:3.6-slim

# All application files live under $APP_HOME inside the image.
ENV APP_HOME=/app
WORKDIR $APP_HOME

# Install production dependencies first, from the requirements file alone,
# so this layer is rebuilt only when requirements.pip changes rather than on
# every source edit. --no-cache-dir keeps pip's download cache out of the image.
COPY requirements.pip ./
RUN pip install --no-cache-dir -r requirements.pip

# Copy the rest of the local code to the container image.
COPY . ./

# Run the web service on container startup. Here we use the gunicorn
# webserver, with one worker process and 8 threads.
# For environments with multiple CPU cores, increase the number of workers
# to be equal to the cores available.
CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 myrss_flask:flask_app
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/myrss2/FEEDS Sun May 24 00:22:05 2020 -0700 3.3 @@ -0,0 +1,43 @@ 3.4 +https://news.google.com/news/rss/?gl=US&ned=us&hl=en 3.5 +https://news.google.com/news/rss/headlines/section/topic/BUSINESS?ned=us&hl=en 3.6 +http://ep01.epimg.net/rss/elpais/portada.xml 3.7 +http://www.sfgate.com/rss/feeds/news_pageone.xml 3.8 +http://www.sfgate.com/bayarea/feed/Bay-Area-News-429.php 3.9 +http://www.weatherwest.com/feed 3.10 +http://sports.yahoo.com/top/rss.xml 3.11 +http://rss.slashdot.org/Slashdot/slashdot 3.12 +http://www.theverge.com/rss/index.xml 3.13 +http://www.reddit.com/.rss 3.14 +http://feeds.feedburner.com/Metafilter 3.15 +http://feeds.feedburner.com/AskMetafilter 3.16 +http://www.vice.com/rss 3.17 +http://feeds.slate.com/slate 3.18 +http://feeds.feedburner.com/TheAtlantic 3.19 +http://aeon.co/magazine/feed/ 3.20 +http://nautil.us/rss/all 3.21 +https://api.quantamagazine.org/feed/ 3.22 +http://undark.org/feed/ 3.23 +http://priceonomics.com/latest.rss 3.24 +https://harpers.org/feed/ 3.25 +http://roadsandkingdoms.com/feed/ 3.26 +http://thebaffler.com/feed 3.27 +http://www.edge.org/feed 3.28 +http://quillette.com/feed/ 3.29 +http://thebrowser.com/feed 3.30 +http://longform.org/feed/ 3.31 +http://longreads.com/rss/ 3.32 +https://www.project-syndicate.org/rss 3.33 +http://lifehacker.com/rss 3.34 +http://bikesnobnyc.blogspot.com/feeds/posts/default 3.35 +http://feeds.feedburner.com/bikehugger 3.36 +http://jalopnik.com/rss 3.37 +https://www.rideapart.com/rss/articles/all/ 3.38 +http://feeds2.feedburner.com/Bikeexif 3.39 +http://feeds2.feedburner.com/Cycleexif 3.40 +http://www.xkcd.com/rss.xml 3.41 +http://feeds.kottke.org/main 3.42 +http://feeds.feedburner.com/OpenCulture 3.43 +http://feeds.feedburner.com/shorpy?q=rss.xml 3.44 +http://feeds.feedburner.com/codinghorror 3.45 +https://danielmiessler.com/feed/ 3.46 +http://syndication.thedailywtf.com/TheDailyWtf
"""myrss2/myrss_app.py -- aggregate the feeds listed in FEEDS into one HTML page.

Each non-comment line of ``FEEDS_FILE`` is a feed URL.  A pool of daemon
worker threads fetches and parses the feeds (RSS 2.x, Atom, RSS 1.0/RDF);
the results are rendered to a single HTML document that is cached on disk
in ``CACHE_HTML_FILE`` for ``CACHE_LIFE`` seconds.
"""

import io
import os
import sys
import re
import urllib.request
import urllib.error
import threading
import queue
import datetime
import time
import traceback

import logging
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
# Accept lower-case names and fall back to INFO for unknown ones, so that a
# mistyped LOG_LEVEL does not prevent the module from being imported.
_log_level = getattr(logging, LOG_LEVEL.upper(), None)
if not isinstance(_log_level, int):
    _log_level = logging.INFO
logging.basicConfig(
    level=_log_level,
    format="%(asctime)s %(levelname)-8s %(message)s",
)

import xml.etree.ElementTree
import html

from html3.html3 import HTML


FEEDS_FILE = "FEEDS"
CACHE_HTML_FILE = "__cache__.html"

CACHE_LIFE = 1200  # [seconds]
MAX_ITEMS = 50
MAX_LINK_Z = 4
MAX_THREADS = 20
URLOPEN_TIMEOUT = 10  # [seconds]


_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")


def _parse_root_tag(root_tag):
    """Split an ElementTree tag into ``(namespace, local_name)``.

    ``"{ns}name"`` yields ``("ns", "name")``; a tag without a namespace
    yields ``(None, "name")``.  ``(None, None)`` is returned when the tag
    does not match at all (e.g. an empty string).
    """
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)

    if re_match is None:
        return (None, None)
    return re_match.group(2, 3)


def _strip_if_not_none(txt):
    """Return *txt* stripped of surrounding whitespace, or ``''`` for None."""
    return txt.strip() if txt is not None else ''


def _child_text(element, path):
    """Return the text of the first sub-element matching *path*.

    Returns None when the sub-element is missing or has no text, so that
    one malformed item does not raise and discard the whole feed.
    """
    child = element.find(path)
    return None if child is None else child.text


def _go_rss(elementTree):
    """Extract ``(title, link, items)`` from an RSS 2.x document.

    ``items`` is a list of ``(title, link)`` tuples, at most ``MAX_ITEMS``
    long.  An item link may be None; the feed-level link is normalised to
    ``''`` so the renderer always receives a string for the heading href.
    """
    title = _strip_if_not_none(_child_text(elementTree, "channel/title"))
    link = _child_text(elementTree, "channel/link") or ''

    items = []

    for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
        it_title = _strip_if_not_none(_child_text(i, "title"))
        it_link = _child_text(i, "link")

        items.append((it_title, it_link))

    return (title, link, items)


def _atom_link(element, ns):
    """Return the href of *element*'s preferred Atom ``link`` child.

    The first link with ``rel="alternate"`` wins; when the element has
    exactly one link, that link is used whatever its ``rel``.  Returns
    ``''`` when no link qualifies (the href itself may still be None if
    the chosen link carries no ``href`` attribute).
    """
    links = element.findall("{%s}link" % ns)
    for candidate in links:
        if len(links) == 1 or candidate.get("rel") == "alternate":
            return candidate.get("href")
    return ''


def _go_atom(elementTree):
    """Extract ``(title, link, items)`` from an Atom document.

    Same return shape as ``_go_rss``.
    """
    ns = "http://www.w3.org/2005/Atom"

    title = _strip_if_not_none(_child_text(elementTree, "{%s}title" % ns))
    link = _atom_link(elementTree, ns) or ''

    items = []

    for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
        it_title = _strip_if_not_none(_child_text(i, "{%s}title" % ns))
        it_link = _atom_link(i, ns)

        items.append((it_title, it_link))

    return (title, link, items)


def _go_purl_rss(elementTree):
    """Extract ``(title, link, items)`` from an RSS 1.0 (RDF) document.

    In this format the ``item`` elements are children of the RDF root,
    not of ``channel``.  Same return shape as ``_go_rss``.
    """
    ns = "http://purl.org/rss/1.0/"

    title = _strip_if_not_none(
        _child_text(elementTree, "{%s}channel/{%s}title" % (ns, ns)))
    link = _child_text(elementTree, "{%s}channel/{%s}link" % (ns, ns)) or ''

    items = []

    for i in elementTree.findall("{%s}item" % ns)[:MAX_ITEMS]:
        it_title = _strip_if_not_none(_child_text(i, "{%s}title" % ns))
        it_link = _child_text(i, "{%s}link" % ns)

        items.append((it_title, it_link))

    return (title, link, items)


_STRIP_HTML_RE = re.compile(r"<.*?>")


def _strip_html(txt):
    """Remove ``<...>`` tags from *txt* and decode HTML entities."""
    return html.unescape(_STRIP_HTML_RE.sub('', txt))


def _to_html(dtnow, docstruct):
    """Render the parsed feeds as one HTML page and return it as a string.

    dtnow     -- datetime at which the refresh started; shown in the page
                 title and used to compute the elapsed time in the footer.
    docstruct -- list of ``(title, link, items)`` tuples, with None for
                 feeds that were skipped or failed.
    """
    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    page_title = "myrss -- %s" % datetime_str

    root = HTML("html")

    header = root.head
    header.meta(name="viewport", content="width=device-width, initial-scale=1")
    header.title(page_title)
    header.link(rel="stylesheet", type="text/css", href="static/index.css")

    body = root.body
    body.h1(page_title)

    # Running counter that cycles the z0..z(MAX_LINK_Z-1) CSS classes so
    # that neighbouring links get different colours.
    link_z = 0

    for feed in docstruct:
        if feed is None:
            continue

        (title, link, items) = feed

        logging.debug("title: %s", title)
        body.h2.a(_strip_html(title), href=link,
                  klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        p = body.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            if not it_title:
                it_title = "(missing title)"
            if it_link is not None:
                p.a(_strip_html(it_title), href=it_link,
                    klass="z%d" % (link_z % MAX_LINK_Z))
            else:
                # No link available: show the title as plain text.
                p += _strip_html(it_title)

            link_z += 1

    # Footer: seconds spent producing the page.
    dtdelta = datetime.datetime.now() - dtnow
    root.div(
        "%.3f" % (dtdelta.days*86400 + dtdelta.seconds + dtdelta.microseconds/1e6),
        klass="debug")

    return str(root)


def _fetch_url(url):
    """Download *url* and return the body decoded as UTF-8.

    Returns None (after logging) when the server answers with an HTTP
    error status.  Any other failure (invalid URL, timeout, connection
    error, undecodable body) propagates to the caller.
    """
    logging.info("processing %s", url)
    request = urllib.request.Request(
        url, headers={"User-Agent": "Mozilla/5.0 Browser"})
    try:
        # The context manager closes the connection once the body is read.
        with urllib.request.urlopen(request, timeout=URLOPEN_TIMEOUT) as feed:
            return str(feed.read(), encoding="utf-8")
    except urllib.error.HTTPError as e:
        logging.info("(%s) %s", url, e)
        return None


def _filter_feed(feed):
    """Return *feed* with characters the XML parser rejects removed."""
    ret = feed

    filter_out = ["\x16"]
    for i in filter_out:
        ret = ret.replace(i, "")

    return ret


def _process_feed(feed):
    """Parse the XML text *feed* and return ``(title, link, items)``.

    Raises NotImplementedError for RSS versions below 2.0 and for
    unrecognised root elements; XML parse errors propagate as raised by
    ElementTree.
    """
    feed_sio = io.StringIO(feed)
    elementTree = xml.etree.ElementTree.parse(feed_sio)
    root = elementTree.getroot()

    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version >= 2.0:
            return _go_rss(elementTree)
        raise NotImplementedError("Unsupported rss version")
    if parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
        return _go_atom(elementTree)
    if parsed_root_tag == ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"):
        return _go_purl_rss(elementTree)
    raise NotImplementedError("Unknown root tag")


class WorkerThread(threading.Thread):
    """Daemon thread that turns ``(idx, url)`` jobs into parsed feeds.

    Requires the keyword arguments ``input_queue`` and ``output_queue``;
    everything else is forwarded to ``threading.Thread``.
    """

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        super().__init__(*args, **kwargs)
        self.daemon = True

    def run(self):
        """Serve jobs forever, emitting exactly one result per job.

        A failed job is reported as ``(idx, None)``: main() counts results
        to know when a refresh is complete, so a result must always be
        produced.
        """
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                feed = _fetch_url(url)
                if feed is not None:
                    docfeed = _process_feed(_filter_feed(feed))
            except Exception as e:
                logging.info("(%s) exception: (%s) %s", url, type(e), e)
            self._output_queue.put((idx, docfeed))


def _read_fresh_cache(epoch_now):
    """Return the cached page if it is younger than CACHE_LIFE, else None."""
    try:
        age = epoch_now - os.stat(CACHE_HTML_FILE).st_mtime
    except OSError:
        # No cache file yet (or it is not accessible).
        return None

    if age >= float(CACHE_LIFE):
        return None

    with open(CACHE_HTML_FILE, encoding="utf-8") as cache_html_file:
        return cache_html_file.read()


def main(input_queue, output_queue, lock):
    """Return the aggregated page, regenerating it when the cache is stale.

    input_queue  -- queue of ``(idx, url)`` jobs consumed by the workers.
    output_queue -- queue of ``(idx, docfeed)`` results they produce.
    lock         -- serialises refreshes, so concurrent requests neither
                    mix up their results nor race on the cache file.

    All jobs are submitted before any result is read, so *output_queue*
    should not be bounded more tightly than the number of feeds.
    """
    with lock:
        logging.debug("main() started")
        epoch_now = time.time()
        dtnow = datetime.datetime.fromtimestamp(epoch_now)

        ret = _read_fresh_cache(epoch_now)

        if ret is None:
            with open(FEEDS_FILE, encoding="utf-8") as feeds_file:
                feedlines = feeds_file.readlines()

            # One slot per line of FEEDS keeps the page in file order no
            # matter in which order the workers finish.
            docstruct = [None]*len(feedlines)
            num_input = 0
            for (i, line) in enumerate(feedlines):
                url = line.strip()
                # Skip blank lines and '#' comments.
                if not url or url.startswith('#'):
                    continue
                input_queue.put((i, url))
                num_input += 1

            for _ in range(num_input):
                (idx, docfeed) = output_queue.get()
                docstruct[idx] = docfeed

            ret = _to_html(dtnow, docstruct)

            with open(CACHE_HTML_FILE, 'w', encoding="utf-8") as cache_html_file:
                cache_html_file.write(ret)
        logging.debug("main() ended")

    return ret


class MyRssApp:
    """The application object: owns the queues, the lock and the workers.

    Usable directly as a WSGI application (``__call__``) or from another
    framework through ``call()``.
    """

    def __init__(self):
        logging.debug("MyRssApp.__init__() called")
        self._iq = queue.Queue(MAX_THREADS)
        # Unbounded on purpose: main() submits every job before it reads
        # any result.  With a bounded result queue the workers block once
        # it is full while main() is still blocked submitting, which
        # deadlocks as soon as FEEDS has more than 3*MAX_THREADS entries.
        self._oq = queue.Queue()
        self._main_lock = threading.Lock()

        for i in range(MAX_THREADS):
            logging.debug("Starting thread: %d", i)
            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()

    # Raw WSGI
    def __call__(self, environ, start_response):
        """WSGI entry point: serve the page, or a plain-text traceback."""
        try:
            response_text = main(self._iq, self._oq, self._main_lock)
            response_code = "200 OK"
            response_type = "text/html; charset=UTF-8"
        except Exception:
            logging.exception("failed to build the page")
            response_text = traceback.format_exc()
            response_code = "500 Internal Server Error"
            response_type = "text/plain; charset=UTF-8"

        # Content-Length counts bytes, so measure the encoded body rather
        # than the str (they differ for any non-ASCII character).
        response_body = response_text.encode("utf-8")

        response_headers = [
            ("Content-Type", response_type),
            ("Content-Length", str(len(response_body))),
        ]
        start_response(response_code, response_headers)

        return [response_body]

    def call(self):
        """Return the page as a string; exceptions propagate to the caller."""
        return main(self._iq, self._oq, self._main_lock)
"""myrss2/myrss_flask.py -- Flask front end for myrss.

Exposes ``flask_app``, the object gunicorn is pointed at
(``myrss_flask:flask_app``).
"""

from flask import Flask

import myrss_app


flask_app = Flask(__name__)

# One shared application object for the whole process; creating it starts
# the feed worker threads.
my_app = myrss_app.MyRssApp()


@flask_app.route("/")
def index():
    """Serve the aggregated feed page at the site root."""
    return my_app.call()
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/myrss2/requirements.pip Sun May 24 00:22:05 2020 -0700 6.3 @@ -0,0 +1,8 @@ 6.4 +click==7.1.2 6.5 +Flask==1.1.2 6.6 +gunicorn==20.0.4 6.7 +html3==1.18 6.8 +itsdangerous==1.1.0 6.9 +Jinja2==2.11.2 6.10 +MarkupSafe==1.1.1 6.11 +Werkzeug==1.0.1
/* myrss2/static/index.css */

body
{
  background-color: #111;
  color: #ccc;
}

a:link
{
  color: #831;
  text-decoration: none;
}

a:hover
{
  text-decoration: underline;
}

a:visited
{
  color: gray;
}

table.index td
{
  padding-right: 15px;
}

div.index_date
{
  font-size: small;
  color: #666;
}

div.email
{
  font-size: small;
}

div.debug
{
  font-family: monospace;
  font-size: x-small;
}

/* sans-serif is a generic family keyword and must not be quoted: in quotes
   it names a font literally called "sans-serif" and the generic fallback
   is never applied. */
h1
{
  font-family: sans-serif;
}

h2
{
  font-family: sans-serif;
}

/* Link colour variants. There is no a.z0 rule, so links with class z0 keep
   the a:link colour above. */
a.z1
{
  color: #A31;
}

a.z2
{
  color: #851;
}

a.z3
{
  color: #833;
}

p:first-line
{
  font-weight: bold;
}
#!/bin/sh
#
# myrss2/tarball.sh -- pack the files needed to build and run the service
# into myrss.tar.gz.
#
# NOTE(review): the paths below start with myrss/, while this script and the
# files it lists were added under myrss2/ -- confirm which directory the
# script is meant to be run against.

FILES="
  myrss/Dockerfile
  myrss/.dockerignore
  myrss/FEEDS
  myrss/myrss_app.py
  myrss/myrss_flask.py
  myrss/requirements.pip
  myrss/static/index.css
"

# ${FILES} is left unquoted on purpose: the shell must split it into one
# argument per path.
tar -czvf myrss.tar.gz ${FILES}