"""Aggregate the RSS 2.0 / Atom feeds listed in ``FEEDS`` into one HTML page.

``FEEDS`` is read from the current directory, one feed URL per line; blank
lines and lines starting with ``#`` are ignored.  The feeds are downloaded
by a pool of worker threads and the resulting page is written to stdout as
UTF-8.  Progress and per-feed errors go to stderr.

NOTE(review): ``html`` is used as an element builder exposing ``html.HTML``
(attribute access creates child tags, ``klass=`` sets the class).  That is
presumably the third-party ``html`` package, not the Python 3 standard
library module of the same name -- confirm before running under Python 3.
"""
import os
import sys
import re
import threading

try:  # Python 2 module names
    import urllib2
    import Queue
except ImportError:  # Python 3 renamed both modules
    import urllib.request as urllib2
    import queue as Queue

import html
import xml.etree.ElementTree

try:
    _text_type = unicode
except NameError:  # Python 3: str is already the text type
    _text_type = str


MAX_ITEMS = 30      # maximum number of entries rendered per feed
MAX_LINK_Z = 4      # number of distinct "zN" CSS classes cycled through
MAX_THREADS = 20    # size of the download worker pool

ATOM_NS = "http://www.w3.org/2005/Atom"


_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")


def _log(message):
    """Write one diagnostic line to stderr.

    A single ``write`` call is used so that lines coming from different
    worker threads are less likely to interleave mid-line.
    """
    sys.stderr.write("%s\n" % message)


def _parse_root_tag(root_tag):
    """Split an ElementTree tag into ``(namespace, local_name)``.

    ``"{uri}name"`` yields ``("uri", "name")``; a tag without a namespace
    yields ``(None, "name")``.  If the pattern does not match at all the
    result is ``(None, None)``.
    """
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)

    if re_match is None:
        return (None, None)

    return re_match.group(2, 3)


def _find_text(node, path, default=''):
    """Return the stripped text of the first element matching *path*.

    *default* is returned when the element is missing or has no text, so a
    single incomplete entry no longer aborts the whole feed.
    """
    found = node.find(path)

    if found is None or found.text is None:
        return default

    return found.text.strip()


def _atom_link(node):
    """Return the href of the preferred Atom ``<link>`` child of *node*.

    A link explicitly marked ``rel="alternate" type="text/html"`` always
    wins, exactly as before.  If there is none, the first link whose ``rel``
    is ``alternate`` or absent is used (an absent ``rel`` means
    ``alternate`` in Atom).  Returns ``''`` when nothing qualifies.
    """
    fallback = None

    for link in node.findall("{%s}link" % ATOM_NS):
        rel = link.get("rel")

        if rel == "alternate" and link.get("type") == "text/html":
            return link.get("href", '')

        if fallback is None and rel in (None, "alternate"):
            fallback = link.get("href", '')

    return fallback or ''


def _go_rss(elementTree):
    """Extract ``(title, link, items)`` from a parsed RSS 2.0 document.

    *items* is a list of at most ``MAX_ITEMS`` ``(title, link)`` tuples.
    Missing or empty elements are reported as ``''``.
    """
    title = _find_text(elementTree, "channel/title")
    link = _find_text(elementTree, "channel/link")

    items = [
        (_find_text(item, "title"), _find_text(item, "link"))
        for item in elementTree.findall("channel/item")[:MAX_ITEMS]
    ]

    return (title, link, items)


def _go_atom(elementTree):
    """Extract ``(title, link, items)`` from a parsed Atom document.

    *items* is a list of at most ``MAX_ITEMS`` ``(title, link)`` tuples.
    Missing titles or links are reported as ``''``.
    """
    title = _find_text(elementTree, "{%s}title" % ATOM_NS)
    link = _atom_link(elementTree)

    items = [
        (_find_text(entry, "{%s}title" % ATOM_NS), _atom_link(entry))
        for entry in elementTree.findall("{%s}entry" % ATOM_NS)[:MAX_ITEMS]
    ]

    return (title, link, items)


def _to_html(docstruct):
    """Render the collected feeds as a UTF-8 encoded HTML byte string.

    *docstruct* is a sequence whose entries are either ``None`` (feed
    skipped or failed) or a ``(title, link, items)`` tuple.  Every anchor
    gets a CSS class ``z0`` .. ``z<MAX_LINK_Z - 1>``, cycling in document
    order.
    """
    root = html.HTML()

    # NOTE(review): this creates an element named "header"; if the document
    # <head> was intended this should probably be ``root.head`` -- confirm.
    header = root.header
    header.title("myrss")
    header.link(rel="stylesheet", type="text/css", href="index.css")

    link_z = 0

    for feed in docstruct:
        if feed is None:
            continue

        (title, link, items) = feed

        root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        p = root.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
            link_z += 1

    return _text_type(root).encode("utf-8")


def _process_url(url):
    """Download and parse one feed.

    Returns a ``(title, link, items)`` tuple, or ``None`` when the URL could
    not be opened (the error is logged to stderr).

    Raises ``NotImplementedError`` for RSS versions below 2.0 and for
    documents whose root element is neither RSS nor Atom.  Parse errors
    propagate to the caller.
    """
    _log("--> processing %s" % url)

    try:
        feed = urllib2.urlopen(url)
    except urllib2.URLError as e:  # HTTPError is a subclass of URLError
        _log("--> (%s) %s" % (url, e))
        return None

    try:
        elementTree = xml.etree.ElementTree.parse(feed)
    finally:
        # The response was previously never closed.
        feed.close()

    root = elementTree.getroot()
    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version < 2.0:
            raise NotImplementedError("Unsupported rss version")
        return _go_rss(elementTree)

    if parsed_root_tag == (ATOM_NS, "feed"):
        return _go_atom(elementTree)

    raise NotImplementedError("Unknown root tag")


class WorkerThread(threading.Thread):
    """Daemon thread that turns ``(index, url)`` jobs into parsed feeds.

    Requires the keyword arguments ``input_queue`` (jobs) and
    ``output_queue`` (results); all other arguments are passed on to
    ``threading.Thread``.  For every job a ``(index, feed_or_None)`` tuple
    is put on the output queue.
    """

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        # Daemon threads do not keep the process alive once main returns.
        self.daemon = True

    def run(self):
        """Process jobs forever; a failing feed yields ``None``, not a crash."""
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                docfeed = _process_url(url)
            except Exception as e:
                _log("--> (%s) exception: %s" % (url, e))
            finally:
                # Publish the result before task_done() so that whoever
                # waits on join() is guaranteed to find it in the queue.
                self._output_queue.put((idx, docfeed))
                self._input_queue.task_done()


def main():
    """Read ``FEEDS``, fetch every feed concurrently and print the page."""
    with open("FEEDS") as feeds_file:
        feedlines = feeds_file.readlines()

    # One slot per input line keeps the output in FEEDS order.
    docstruct = [None] * len(feedlines)

    # Unbounded queues: the maxsize argument must be an integer; passing
    # the list of lines only worked by accident on Python 2.
    iq = Queue.Queue()
    oq = Queue.Queue()

    for _ in range(MAX_THREADS):
        WorkerThread(input_queue=iq, output_queue=oq).start()

    for (i, line) in enumerate(feedlines):
        url = line.strip()
        if url and not url.startswith('#'):
            iq.put((i, url))

    iq.join()

    while True:
        try:
            (idx, docfeed) = oq.get_nowait()
        except Queue.Empty:
            break
        docstruct[idx] = docfeed

    # _to_html returns bytes; on Python 3 they must go to the binary layer.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(_to_html(docstruct) + b"\n")


if __name__ == "__main__":
    main()