Mercurial > hg > index.fcgi > www > www-1
diff myrss2/myrss_app.py @ 108:cffd95813b82
add myrss2
| author | paulo |
|---|---|
| date | Sun, 24 May 2020 00:22:05 -0700 |
| parents | |
| children | 1a5c0fc5627a |
line diff
"""myrss: fetch a list of RSS/Atom feeds concurrently and render them as one
cached HTML page, served through a raw WSGI callable (MyRssApp)."""

import io
import os
import sys
import re
import urllib.request
import urllib.error
import threading
import queue
import datetime
import time
import traceback

import logging
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL),
    format="%(asctime)s %(levelname)-8s %(message)s",
)

import xml.etree.ElementTree
import html

from html3.html3 import HTML


FEEDS_FILE = "FEEDS"               # input: one feed URL per line, '#' comments
CACHE_HTML_FILE = "__cache__.html" # rendered page cache

CACHE_LIFE = 1200  # [seconds] serve the cached page if younger than this
MAX_ITEMS = 50     # max items rendered per feed
MAX_LINK_Z = 4     # number of rotating CSS classes z0..z3
MAX_THREADS = 20   # worker threads fetching feeds
URLOPEN_TIMEOUT = 10  # [seconds]


# Matches an ElementTree tag of the form "{namespace}tag" or a bare "tag".
_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")


def _parse_root_tag(root_tag):
    """Split an ElementTree root tag into ``(namespace, tag)``.

    Returns ``(None, tag)`` for an un-namespaced tag and ``(None, None)``
    when the tag cannot be parsed at all.
    """
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)

    if re_match is None:
        return (None, None)
    return re_match.group(2, 3)


def _strip_if_not_none(txt):
    """Return ``txt.strip()``, or '' when txt is None (missing element text)."""
    return txt.strip() if txt is not None else ''


def _go_rss(elementTree):
    """Extract ``(title, link, [(item_title, item_link), ...])`` from RSS 2.0."""
    title = _strip_if_not_none(elementTree.find("channel/title").text)
    link = elementTree.find("channel/link").text

    items = []
    for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
        it_title = _strip_if_not_none(i.find("title").text)
        it_link = i.find("link").text
        items.append((it_title, it_link))

    return (title, link, items)


def _go_atom(elementTree):
    """Extract ``(title, link, items)`` from an Atom feed.

    Atom entries may carry several <link> elements; prefer the one with
    rel="alternate", or take the single link when only one is present.
    """
    ns = "http://www.w3.org/2005/Atom"

    title = _strip_if_not_none(elementTree.find("{%s}title" % ns).text)
    link = ''

    links = elementTree.findall("{%s}link" % ns)
    for i in links:
        if len(links) == 1 or i.get("rel") == "alternate":
            link = i.get("href")
            break

    items = []
    for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
        it_title = _strip_if_not_none(i.find("{%s}title" % ns).text)
        it_link = ''

        it_links = i.findall("{%s}link" % ns)
        for j in it_links:
            if len(it_links) == 1 or j.get("rel") == "alternate":
                it_link = j.get("href")
                break

        items.append((it_title, it_link))

    return (title, link, items)


def _go_purl_rss(elementTree):
    """Extract ``(title, link, items)`` from an RSS 1.0 (purl.org/RDF) feed."""
    ns = "http://purl.org/rss/1.0/"

    title = _strip_if_not_none(elementTree.find("{%s}channel/{%s}title" % (ns, ns)).text)
    link = elementTree.find("{%s}channel/{%s}link" % (ns, ns)).text

    items = []
    for i in elementTree.findall("{%s}item" % ns)[:MAX_ITEMS]:
        it_title = _strip_if_not_none(i.find("{%s}title" % ns).text)
        it_link = i.find("{%s}link" % ns).text
        items.append((it_title, it_link))

    return (title, link, items)


# Crude tag stripper; feed titles occasionally embed HTML markup.
_STRIP_HTML_RE = re.compile(r"<.*?>")


def _strip_html(txt):
    """Remove HTML tags from txt and unescape HTML entities."""
    return html.unescape(_STRIP_HTML_RE.sub('', txt))


def _to_html(dtnow, docstruct):
    """Render the page.

    dtnow: datetime used for the page title / generation timestamp.
    docstruct: list of ``(title, link, items)`` tuples or None (failed feed).
    Returns the full HTML document as a string.
    """
    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    page_title = "myrss -- %s" % datetime_str

    root = HTML("html")

    header = root.head
    header.meta(name="viewport", content="width=device-width, initial-scale=1")
    header.title(page_title)
    header.link(rel="stylesheet", type="text/css", href="static/index.css")

    body = root.body
    body.h1(page_title)

    link_z = 0  # rotates through CSS classes z0..z(MAX_LINK_Z-1)

    for feed in docstruct:
        if feed is None:
            # Feed failed to fetch/parse; skip its slot.
            continue

        (title, link, items) = feed

        logging.debug("title: %s", title)
        body.h2.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        p = body.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            if not it_title:
                it_title = "(missing title)"
            if it_link is not None:
                p.a(_strip_html(it_title), href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
            else:
                p += _strip_html(it_title)

            link_z += 1

    # Page-generation time (seconds) in a debug footer.
    dtdelta = datetime.datetime.now() - dtnow
    root.div("%.3f" % (dtdelta.days*86400 + dtdelta.seconds + dtdelta.microseconds/1e6), klass="debug")

    return str(root)


def _fetch_url(url):
    """Fetch *url* and return the body decoded as UTF-8.

    Returns None on any HTTP/URL error (URLError covers HTTPError, DNS
    failures, refused connections and socket timeouts); the error is logged.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0 Browser"})
    try:
        logging.info("processing %s", url)
        # Context manager closes the HTTP response even if read() raises.
        with urllib.request.urlopen(request, timeout=URLOPEN_TIMEOUT) as feed:
            return str(feed.read(), encoding="utf-8")
    except urllib.error.URLError as e:
        logging.info("(%s) %s", url, e)
        return None


def _filter_feed(feed):
    """Strip control characters that break the XML parser."""
    ret = feed

    filter_out = ["\x16"]
    for i in filter_out:
        ret = ret.replace(i, "")

    return ret


def _process_feed(feed):
    """Parse a feed document (string) and dispatch on its root element.

    Returns a ``(title, link, items)`` tuple.
    Raises NotImplementedError for unsupported formats and lets XML parse
    errors propagate (the worker thread logs them).
    """
    ret = None

    feed_sio = io.StringIO(feed)
    elementTree = xml.etree.ElementTree.parse(feed_sio)
    root = elementTree.getroot()

    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", "0"))
        if version >= 2.0:
            ret = _go_rss(elementTree)
        else:
            raise NotImplementedError("Unsupported rss version")
    elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
        ret = _go_atom(elementTree)
    elif parsed_root_tag == ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"):
        ret = _go_purl_rss(elementTree)
    else:
        raise NotImplementedError("Unknown root tag")

    return ret


class WorkerThread(threading.Thread):
    """Daemon thread: takes ``(idx, url)`` from input_queue, fetches and
    parses the feed, and puts ``(idx, docfeed_or_None)`` on output_queue."""

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        self.daemon = True  # do not block interpreter shutdown

    def run(self):
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                feed = _fetch_url(url)
                if feed is not None:
                    docfeed = _process_feed(_filter_feed(feed))
            except Exception as e:
                # Any per-feed failure yields a None slot; the page renders
                # without that feed rather than failing entirely.
                logging.info("(%s) exception: (%s) %s", url, type(e), e)
            self._output_queue.put((idx, docfeed))


def main(input_queue, output_queue, lock):
    """Build (or serve from cache) the rendered HTML page.

    The lock serializes page generation across concurrent WSGI requests;
    the queues connect to the pool of WorkerThreads.
    """
    ret = ''

    with lock:
        logging.debug("main() started")
        epoch_now = time.time()
        dtnow = datetime.datetime.fromtimestamp(epoch_now)

        if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
            # Fresh cache: serve it verbatim.
            with open(CACHE_HTML_FILE) as cache_html_file:
                ret = cache_html_file.read()
        else:
            with open(FEEDS_FILE) as feeds_file:
                feedlines = feeds_file.readlines()

            # One slot per line so results can be placed in file order.
            docstruct = [None]*len(feedlines)
            num_input = 0
            for (i, raw_line) in enumerate(feedlines):
                url = raw_line.strip()
                # Skip comments and blank lines (a blank URL is a guaranteed
                # fetch failure).
                if url and not url.startswith('#'):
                    input_queue.put((i, url))
                    num_input += 1

            for _ in range(num_input):
                (idx, docfeed) = output_queue.get()
                docstruct[idx] = docfeed

            ret = _to_html(dtnow, docstruct)

            with open(CACHE_HTML_FILE, 'w') as cache_html_file:
                cache_html_file.write(ret)
        logging.debug("main() ended")

    return ret


class MyRssApp:
    """Raw WSGI application backed by a fixed pool of feed-fetching threads."""

    def __init__(self):
        logging.debug("MyRssApp.__init__() called")
        self._iq = queue.Queue(MAX_THREADS)
        self._oq = queue.Queue(MAX_THREADS)
        self._main_lock = threading.Lock()

        for i in range(MAX_THREADS):
            logging.debug("Starting thread: %d", i)
            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()

    # Raw WSGI
    def __call__(self, environ, start_response):
        response_code = "500 Internal Server Error"
        response_type = "text/plain; charset=UTF-8"

        try:
            response_body = main(self._iq, self._oq, self._main_lock)
            response_code = "200 OK"
            response_type = "text/html; charset=UTF-8"
        except Exception:
            # Render the traceback as the (plain-text) 500 body.
            response_body = traceback.format_exc()

        # Encode once: Content-Length must be the BYTE length of the
        # UTF-8 body, not the character count of the str.
        response_bytes = bytes(response_body, encoding="utf-8")
        response_headers = [
            ("Content-Type", response_type),
            ("Content-Length", str(len(response_bytes))),
        ]
        start_response(response_code, response_headers)

        return [response_bytes]

    def call(self):
        """Non-WSGI entry point: return the rendered page directly."""
        return main(self._iq, self._oq, self._main_lock)