"""myrss -- WSGI application that renders a set of RSS/Atom feeds as one page.

The feed URLs are read from FEEDS_FILE (one per line, ``#`` starts a comment
line), fetched concurrently by a pool of daemon worker threads, and rendered
to a single HTML page which is cached in CACHE_HTML_FILE for CACHE_LIFE
seconds.

This module targets Python 2 (urllib2, Queue, HTMLParser, unicode).  The
``html`` module imported below must provide an ``HTML`` builder class; it is
not part of the Python 2 standard library.
"""

import os
import sys
import re
import urllib2
import threading
import Queue
import datetime
import time
import traceback

import logging
logging.basicConfig(
    #level=logging.DEBUG,
    #filename="_LOG",
    #format="%(asctime)s %(levelname)-8s %(message)s",
)

import xml.etree.ElementTree
import HTMLParser

import html


FEEDS_FILE = "FEEDS"
CACHE_HTML_FILE = "__cache__.html"

CACHE_LIFE = 1200  # [seconds]
MAX_ITEMS = 50
MAX_LINK_Z = 4
MAX_THREADS = 20
URLOPEN_TIMEOUT = 60  # [seconds]


# Splits an ElementTree tag of the form "{namespace}name" (namespace optional).
_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")


def _parse_root_tag(root_tag):
    """Split an ElementTree tag into a ``(namespace, local_name)`` tuple.

    ``namespace`` is None when the tag carries no ``{...}`` prefix.  Returns
    ``(None, None)`` when the tag does not match at all (e.g. empty string).
    """
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)
    if re_match is None:
        return (None, None)
    return re_match.group(2, 3)


def _strip_if_not_none(txt):
    """Return ``txt`` stripped of surrounding whitespace, or '' for None."""
    return txt.strip() if txt is not None else ''


def _find_text(element, path):
    """Return the text of the first sub-element matching ``path``.

    Returns None both when no element matches and when the matching element
    has no text, so a feed with a missing <title>/<link> degrades to an
    untitled / unlinked entry instead of raising AttributeError.
    """
    found = element.find(path)
    return found.text if found is not None else None


def _go_rss(elementTree):
    """Extract ``(title, link, items)`` from an RSS 2.x document.

    ``items`` is a list of ``(item_title, item_link)`` tuples, at most
    MAX_ITEMS long.  Titles are stripped strings ('' when missing); links are
    the raw element text, or None when missing.
    """
    title = _strip_if_not_none(_find_text(elementTree, "channel/title"))
    link = _find_text(elementTree, "channel/link")

    items = []
    for item in elementTree.findall("channel/item")[:MAX_ITEMS]:
        it_title = _strip_if_not_none(_find_text(item, "title"))
        it_link = _find_text(item, "link")
        items.append((it_title, it_link))

    return (title, link, items)


def _atom_html_link(element, ns):
    """Return the href of the first text/html alternate <link> child, or ''."""
    for candidate in element.findall("{%s}link" % ns):
        if candidate.get("type") == "text/html" and candidate.get("rel") == "alternate":
            return candidate.get("href")
    return ''


def _go_atom(elementTree):
    """Extract ``(title, link, items)`` from an Atom document.

    Links are taken from the first <link> whose ``type`` is "text/html" and
    whose ``rel`` is "alternate"; '' is used when there is no such link.
    ``items`` holds at most MAX_ITEMS ``(entry_title, entry_link)`` tuples.
    """
    ns = "http://www.w3.org/2005/Atom"

    title = _strip_if_not_none(_find_text(elementTree, "{%s}title" % ns))
    link = _atom_html_link(elementTree, ns)

    items = []
    for entry in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
        it_title = _strip_if_not_none(_find_text(entry, "{%s}title" % ns))
        it_link = _atom_html_link(entry, ns)
        items.append((it_title, it_link))

    return (title, link, items)


def _go_purl_rss(elementTree):
    """Extract ``(title, link, items)`` from an RSS 1.0 (RDF) document.

    ``items`` holds at most MAX_ITEMS ``(item_title, item_link)`` tuples;
    missing titles become '' and missing links become None.
    """
    ns = "http://purl.org/rss/1.0/"

    title = _strip_if_not_none(
        _find_text(elementTree, "{%s}channel/{%s}title" % (ns, ns)))
    link = _find_text(elementTree, "{%s}channel/{%s}link" % (ns, ns))

    items = []
    # In RSS 1.0 the items are children of the RDF root, not of <channel>.
    for item in elementTree.findall("{%s}item" % ns)[:MAX_ITEMS]:
        it_title = _strip_if_not_none(_find_text(item, "{%s}title" % ns))
        it_link = _find_text(item, "{%s}link" % ns)
        items.append((it_title, it_link))

    return (title, link, items)


_STRIP_HTML_RE = re.compile(r"<.*?>")
_htmlParser = HTMLParser.HTMLParser()


def _strip_html(txt):
    """Remove anything that looks like a tag from ``txt`` and unescape entities."""
    return _htmlParser.unescape(_STRIP_HTML_RE.sub('', txt))


def _to_html(dtnow, docstruct):
    """Render the parsed feeds as a UTF-8 encoded HTML page.

    ``dtnow`` is the datetime at which the request started; it is shown in the
    page title and used to compute the elapsed time printed in the trailing
    "debug" div.  ``docstruct`` is a list of ``(title, link, items)`` tuples;
    None entries (skipped or failed feeds) are ignored.
    """
    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    page_title = "myrss -- %s" % datetime_str

    root = html.HTML("html")

    # <title> and the stylesheet link belong in <head>, not <header>.
    head = root.head
    head.title(page_title)
    head.link(rel="stylesheet", type="text/css", href="index.css")

    body = root.body
    body.h1(page_title)

    # Running counter used to cycle links through the z0..z(MAX_LINK_Z-1)
    # CSS classes.
    link_z = 0

    for feed in docstruct:
        if feed is None:
            continue

        (title, link, items) = feed

        heading = body.h2
        if link is not None:
            heading.a(_strip_html(title), href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        else:
            # No feed-level link: emit the title as plain text, exactly as is
            # done for items without a link below.
            heading += _strip_html(title)
        link_z += 1
        p = body.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            if not it_title:
                it_title = "(missing title)"
            if it_link is not None:
                p.a(_strip_html(it_title), href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
            else:
                p += _strip_html(it_title)

            link_z += 1

    dtdelta = datetime.datetime.now() - dtnow
    root.div("%.3f" % (dtdelta.days*86400 + dtdelta.seconds + dtdelta.microseconds/1e6), klass="debug")

    return unicode(root).encode("utf-8")


def _fetch_url(url):
    """Open ``url`` and return the response object, or None on an HTTP error.

    Only urllib2.HTTPError is handled here (logged, None returned); any other
    exception propagates to the caller.  The caller owns the returned
    response and is responsible for closing it.
    """
    logging.info("processing %s", url)
    request = urllib2.Request(url, headers={"User-Agent": ''})
    try:
        return urllib2.urlopen(request, timeout=URLOPEN_TIMEOUT)
    except urllib2.HTTPError as e:
        logging.info("(%s) %s", url, e)
        return None


def _process_feed(feed):
    """Parse an open feed and return its ``(title, link, items)`` tuple.

    ``feed`` is a file-like object containing the feed XML.  The format is
    chosen from the root tag: RSS >= 2.0, Atom, or RSS 1.0 (RDF).

    Raises NotImplementedError for older RSS versions and unknown root tags.
    """
    elementTree = xml.etree.ElementTree.parse(feed)
    root = elementTree.getroot()

    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version < 2.0:
            raise NotImplementedError("Unsupported rss version")
        return _go_rss(elementTree)

    if parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
        return _go_atom(elementTree)

    if parsed_root_tag == ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "RDF"):
        return _go_purl_rss(elementTree)

    raise NotImplementedError("Unknown root tag")


class WorkerThread(threading.Thread):
    """Daemon thread that fetches and parses feeds taken from a queue.

    Requires the keyword arguments ``input_queue`` (yielding ``(idx, url)``
    tuples) and ``output_queue`` (receiving ``(idx, docfeed)`` tuples, where
    ``docfeed`` is None when the feed could not be fetched or parsed).
    """

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        self.daemon = True

    def run(self):
        """Process jobs forever; exactly one result is emitted per job."""
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                feed = _fetch_url(url)
                if feed is not None:
                    try:
                        docfeed = _process_feed(feed)
                    finally:
                        # Always release the connection, even if parsing fails.
                        feed.close()
            except Exception as e:
                # Best effort: a broken feed must not kill the worker, and
                # main() still needs a result for this index.
                logging.info("(%s) exception: %s", url, e)
            self._output_queue.put((idx, docfeed))


def main(input_queue, output_queue, lock):
    """Return the aggregated HTML page, regenerating the cache when stale.

    ``input_queue`` / ``output_queue`` are the queues served by the
    WorkerThread pool; ``lock`` serialises concurrent requests so that only
    one of them refreshes the cache at a time.

    Returns the page as a byte string (either read back from
    CACHE_HTML_FILE or freshly rendered and written to it).
    """
    ret = ''

    with lock:
        epoch_now = time.time()
        dtnow = datetime.datetime.fromtimestamp(epoch_now)

        if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
            with open(CACHE_HTML_FILE) as cache_html_file:
                ret = cache_html_file.read()

        else:
            with open(FEEDS_FILE) as feeds_file:
                feedlines = feeds_file.readlines()

            # One slot per line of FEEDS_FILE so that the page keeps the
            # file's ordering; comment and blank lines simply stay None.
            docstruct = [None]*len(feedlines)
            jobs = []
            for (i, l) in enumerate(feedlines):
                l = l.strip()
                if l and not l.startswith('#'):
                    jobs.append((i, l))

            # Interleave submitting jobs and collecting results.  Both queues
            # may be bounded, so pushing every job before reading any result
            # would block forever once the input queue, the workers and the
            # output queue are all full.
            num_submitted = 0
            num_received = 0
            while num_received < len(jobs):
                while num_submitted < len(jobs):
                    try:
                        input_queue.put_nowait(jobs[num_submitted])
                    except Queue.Full:
                        break
                    num_submitted += 1

                # At least one job is in flight here, so this cannot block
                # indefinitely: workers emit one result per job.
                (idx, docfeed) = output_queue.get()
                docstruct[idx] = docfeed
                num_received += 1

            ret = _to_html(dtnow, docstruct)

            with open(CACHE_HTML_FILE, 'w') as cache_html_file:
                cache_html_file.write(ret)

    return ret


class MyRssApp(object):
    """WSGI application object serving the aggregated feed page.

    Creating an instance starts MAX_THREADS daemon worker threads which are
    shared by all requests handled by that instance.
    """

    def __init__(self):
        self._iq = Queue.Queue(MAX_THREADS)
        self._oq = Queue.Queue(MAX_THREADS)
        self._main_lock = threading.Lock()

        for _ in range(MAX_THREADS):
            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()

    def __call__(self, environ, start_response):
        """Handle one WSGI request.

        Responds with the HTML page and "200 OK", or with the traceback as
        plain text and "500 Internal Server Error" when page generation fails.
        """
        response_code = "500 Internal Server Error"
        response_type = "text/plain; charset=UTF-8"

        try:
            response_body = main(self._iq, self._oq, self._main_lock)
            response_code = "200 OK"
            response_type = "text/html; charset=UTF-8"
        except Exception:
            # NOTE(review): the traceback is sent to the client; useful for
            # debugging but consider hiding it on a public deployment.
            response_body = traceback.format_exc()

        response_headers = [
            ("Content-Type", response_type),
            ("Content-Length", str(len(response_body))),
        ]
        start_response(response_code, response_headers)

        return [response_body]