# Feed aggregator core: downloads the RSS/Atom feeds listed in FEEDS_FILE
# through a pool of worker threads and renders them as one cached HTML page.

import os
import sys
import re
import threading
import datetime
import time
import logging

# Python 2 module names, with Python 3 fallbacks so that the module can at
# least be imported (e.g. to exercise the feed parsers) under either
# interpreter.  NOTE(review): rendering (_to_html) still relies on the
# Python 2 ``unicode`` builtin and on ``html.HTML``.
try:
    import urllib2
except ImportError:
    import urllib.request as urllib2

try:
    import Queue
except ImportError:
    import queue as Queue

logging.basicConfig(level=logging.INFO)

# ``html`` must provide an ``HTML`` builder class (used by _to_html);
# presumably the third-party "html" package -- confirm against deployment.
import html
import xml.etree.ElementTree


FEEDS_FILE = "FEEDS"
CACHE_HTML_FILE = "__cache__.html"

#CACHE_LIFE = 1200 # [seconds]
CACHE_LIFE = 30 # [seconds]
MAX_ITEMS = 30
MAX_LINK_Z = 4
MAX_THREADS = 20

_ATOM_NS = "http://www.w3.org/2005/Atom"


# Splits an ElementTree tag of the form "{namespace}name" (namespace optional).
_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")


def _parse_root_tag(root_tag):
    """Split an ElementTree tag into ``(namespace, local_name)``.

    ``"{ns}feed"`` gives ``("ns", "feed")``; a tag without a namespace gives
    ``(None, "tag")``.  ``(None, None)`` is returned when the tag does not
    match the pattern at all (e.g. an empty string).
    """
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)

    if re_match is None:
        return (None, None)

    return re_match.group(2, 3)


def _stripped_text(parent, path):
    """Return the stripped text of the first *path* match below *parent*.

    A missing element and an element without text both yield ``''`` so that
    one incomplete entry does not abort the parsing of a whole feed.
    """
    return (parent.findtext(path) or '').strip()


def _go_rss(elementTree):
    """Extract ``(title, link, items)`` from a parsed RSS document.

    *items* is a list of ``(item_title, item_link)`` tuples holding at most
    MAX_ITEMS entries.  Missing or empty elements are reported as ``''``.
    """
    title = _stripped_text(elementTree, "channel/title")
    link = _stripped_text(elementTree, "channel/link")

    items = [
        (_stripped_text(item, "title"), _stripped_text(item, "link"))
        for item in elementTree.findall("channel/item")[:MAX_ITEMS]
    ]

    return (title, link, items)


def _atom_html_link(element):
    """Return the href of the first HTML "alternate" link below *element*.

    RFC 4287 treats a link without ``rel`` as ``rel="alternate"`` and makes
    ``type`` optional, so absent attributes are accepted as a match.
    Returns ``''`` when there is no such link or it has no ``href``.
    """
    for link in element.findall("{%s}link" % _ATOM_NS):
        if (link.get("rel", "alternate") == "alternate"
                and link.get("type", "text/html") == "text/html"):
            return link.get("href", '')

    return ''


def _go_atom(elementTree):
    """Extract ``(title, link, items)`` from a parsed Atom document.

    *items* is a list of ``(entry_title, entry_link)`` tuples holding at
    most MAX_ITEMS entries.  Missing or empty elements are reported as ``''``.
    """
    title_path = "{%s}title" % _ATOM_NS

    title = _stripped_text(elementTree, title_path)
    link = _atom_html_link(elementTree)

    items = [
        (_stripped_text(entry, title_path), _atom_html_link(entry))
        for entry in elementTree.findall("{%s}entry" % _ATOM_NS)[:MAX_ITEMS]
    ]

    return (title, link, items)


def _to_html(dtnow, docstruct):
    """Render the collected feeds as a UTF-8 encoded HTML page.

    dtnow     -- datetime shown in the page title and heading.
    docstruct -- list of ``(title, link, items)`` tuples; ``None`` entries
                 (skipped or failed feeds) are ignored.

    Every link gets a class "z0".."z<MAX_LINK_Z - 1>", cycling over all
    links of the page in document order.
    """
    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    page_title = "myrss -- %s" % datetime_str

    root = html.HTML("html")

    # <title> and the stylesheet <link> belong in <head>; the previous
    # ``root.header`` emitted an HTML5 <header> section element instead.
    head = root.head
    head.title(page_title)
    head.link(rel="stylesheet", type="text/css", href="index.css")

    body = root.body
    body.h1(page_title)

    link_z = 0

    for feed in docstruct:
        if feed is None:
            continue

        (title, link, items) = feed

        body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        p = body.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
            link_z += 1

    return unicode(root).encode("utf-8")


def _process_url(url):
    """Download and parse one feed.

    Returns the ``(title, link, items)`` tuple of the feed, or ``None`` when
    the server answers with an HTTP error (which is logged).

    Raises NotImplementedError for RSS versions below 2.0 and for unknown
    root tags; download and XML parsing errors other than HTTPError
    propagate to the caller.
    """
    logging.info("processing %s", url)

    try:
        feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
    except urllib2.HTTPError as e:
        logging.info("(%s) %s", url, e)
        return None

    # Release the connection as soon as the document is parsed, even when
    # parsing fails.
    try:
        elementTree = xml.etree.ElementTree.parse(feed)
    finally:
        feed.close()

    root = elementTree.getroot()
    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version < 2.0:
            raise NotImplementedError("Unsupported rss version")
        return _go_rss(elementTree)

    if parsed_root_tag == (_ATOM_NS, "feed"):
        return _go_atom(elementTree)

    raise NotImplementedError("Unknown root tag")


class WorkerThread(threading.Thread):
    """Daemon thread turning ``(idx, url)`` jobs into ``(idx, feed)`` results.

    Requires the keyword arguments ``input_queue`` and ``output_queue``; all
    other arguments are passed on to ``threading.Thread``.
    """

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        self.daemon = True

    def run(self):
        """Serve jobs forever; exactly one result is emitted per job."""
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                docfeed = _process_url(url)
            except Exception as e:
                # Best effort: a broken feed is logged and reported as None
                # so that the consumer still receives one result per job.
                logging.info("(%s) exception: %s", url, e)
            self._output_queue.put((idx, docfeed))


def _feed_jobs(feedlines):
    """Return ``(line_index, url)`` for every usable line of the feeds file.

    Blank lines and lines whose first non-blank character is '#' are skipped.
    """
    jobs = []

    for (i, line) in enumerate(feedlines):
        url = line.strip()
        if url and not url.startswith('#'):
            jobs.append((i, url))

    return jobs


def _fetch_all(input_queue, output_queue, jobs, num_slots):
    """Run *jobs* through the worker queues and return the ordered results.

    Returns a list of *num_slots* entries in which the result of job
    ``(idx, url)`` is stored at position ``idx``; other slots stay ``None``.

    Results are collected whenever the input queue is full.  Submitting
    everything first would hang with bounded queues as soon as the number of
    jobs exceeds what the queues and the workers can hold together.
    """
    docstruct = [None] * num_slots
    pending = 0

    for job in jobs:
        while True:
            try:
                input_queue.put_nowait(job)
                break
            except Queue.Full:
                # A full input queue means results are on their way: take
                # one to let a worker make progress, then retry.
                (idx, docfeed) = output_queue.get()
                docstruct[idx] = docfeed
                pending -= 1
        pending += 1

    for _ in range(pending):
        (idx, docfeed) = output_queue.get()
        docstruct[idx] = docfeed

    return docstruct


def main(input_queue, output_queue):
    """Return the aggregated page as UTF-8 encoded HTML.

    The page is served from CACHE_HTML_FILE while that file is younger than
    CACHE_LIFE seconds.  Otherwise the feeds listed in FEEDS_FILE are
    fetched through the worker queues, rendered and written to the cache.

    input_queue  -- queue from which the workers read ``(idx, url)`` jobs.
    output_queue -- queue on which the workers put ``(idx, feed)`` results.

    NOTE(review): results are matched by count only, so overlapping calls
    sharing the same queues would receive each other's results.
    """
    epoch_now = time.time()
    dtnow = datetime.datetime.fromtimestamp(epoch_now)

    # stat() directly rather than exists() + stat(): the cache file may
    # disappear between the two calls.
    try:
        cache_age = epoch_now - os.stat(CACHE_HTML_FILE).st_mtime
    except OSError:
        cache_age = None

    if cache_age is not None and cache_age < float(CACHE_LIFE):
        # The cache holds already-encoded bytes, hence binary mode.
        with open(CACHE_HTML_FILE, 'rb') as cache_html_file:
            return cache_html_file.read()

    with open(FEEDS_FILE) as feeds_file:
        feedlines = feeds_file.readlines()

    docstruct = _fetch_all(input_queue, output_queue,
                           _feed_jobs(feedlines), len(feedlines))

    ret = _to_html(dtnow, docstruct)

    with open(CACHE_HTML_FILE, 'wb') as cache_html_file:
        cache_html_file.write(ret)

    return ret
class MyRssApp(object):
    """WSGI application serving the aggregated feed page.

    Creating an instance starts MAX_THREADS daemon WorkerThread instances,
    all reading from one job queue and writing to one result queue.

    NOTE(review): both queues are shared by every request, so overlapping
    requests could read each other's results -- confirm that the server
    calls this application from a single thread.
    """

    def __init__(self):
        # Unbounded, so that neither the request handler nor a worker can
        # block forever on a full queue, whatever the number of feeds is
        # relative to MAX_THREADS.
        self._iq = Queue.Queue()
        self._oq = Queue.Queue()

        for _ in range(MAX_THREADS):
            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()

    def __call__(self, environ, start_response):
        """Handle one WSGI request: always answers 200 with the HTML page.

        environ        -- WSGI environment (not inspected).
        start_response -- WSGI callable receiving the status and headers.

        Returns a one-element list holding the response body.
        """
        response_body = main(self._iq, self._oq)
        response_headers = [
            # The page is produced as UTF-8 encoded bytes; say so instead of
            # leaving the client to guess the encoding.
            ("Content-Type", "text/html; charset=utf-8"),
            ("Content-Length", str(len(response_body))),
        ]
        start_response("200 OK", response_headers)

        return [response_body]