view myrss/myrss_parser.py @ 40:62464a0034d1

add threaded url opener
author paulo
date Thu, 31 Jan 2013 02:19:39 -0800
parents 915032dd35f4
children 5f9bc02e9caf
line source
1 import os
2 import sys
3 import re
4 import urllib2
5 import threading
6 import Queue
8 import html
9 import xml.etree.ElementTree
# Upper bound on the number of entries kept from each individual feed.
MAX_ITEMS = 30

# Number of distinct "z<N>" CSS classes the rendered links cycle through.
MAX_LINK_Z = 4

# Number of worker threads started to fetch and parse feeds.
MAX_THREADS = 20
# Splits an ElementTree tag of the form "{namespace}local" (the namespace
# part is optional) into its namespace and local name.
_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")


def _parse_root_tag(root_tag):
    """Split an element tag into a ``(namespace, local_name)`` pair.

    ``namespace`` is ``None`` when the tag carries no ``{...}`` prefix.
    ``(None, None)`` is returned when the tag does not match at all
    (for example an empty string).
    """
    found = _PARSE_ROOT_TAG_RE.match(root_tag)
    if found is not None:
        # Group 2 is the namespace without braces, group 3 the local name.
        return found.group(2, 3)
    return (None, None)
def _rss_text(parent, path):
    """Return the stripped text of the element at *path*, or '' if absent/empty."""
    node = parent.find(path)
    if node is None or node.text is None:
        return ""
    return node.text.strip()


def _go_rss(elementTree, max_items=None):
    """Extract the channel title, channel link and items of an RSS document.

    Args:
        elementTree: parsed document whose root is the ``<rss>`` element;
            only its ``find``/``findall`` methods are used.
        max_items: maximum number of ``<item>`` elements to return.
            Defaults to the module-level ``MAX_ITEMS``.

    Returns:
        A ``(title, link, items)`` tuple where ``items`` is a list of
        ``(item_title, item_link)`` pairs in document order.  Any missing or
        empty ``<title>``/``<link>`` element yields an empty string rather
        than raising, so one incomplete item no longer discards the feed.
    """
    if max_items is None:
        max_items = MAX_ITEMS

    title = _rss_text(elementTree, "channel/title")
    link = _rss_text(elementTree, "channel/link")

    items = [
        (_rss_text(item, "title"), _rss_text(item, "link"))
        for item in elementTree.findall("channel/item")[:max_items]
    ]

    return (title, link, items)
def _atom_text(parent, tag):
    """Return the stripped text of child *tag*, or '' if absent/empty."""
    node = parent.find(tag)
    if node is None or node.text is None:
        return ""
    return node.text.strip()


def _atom_alternate_href(parent, link_tag):
    """Return the href of the HTML "alternate" link under *parent*, or ''.

    An explicit ``rel="alternate" type="text/html"`` link always wins.
    Otherwise the first link whose ``rel`` is absent or "alternate" and whose
    ``type`` is absent or "text/html" is used, since Atom treats a missing
    ``rel`` as "alternate" and ``type`` is optional.
    """
    links = parent.findall(link_tag)

    for node in links:
        href = node.get("href")
        if (href and node.get("type") == "text/html"
                and node.get("rel") == "alternate"):
            return href

    for node in links:
        href = node.get("href")
        if (href and node.get("rel", "alternate") == "alternate"
                and node.get("type") in (None, "text/html")):
            return href

    return ""


def _go_atom(elementTree, max_items=None):
    """Extract the feed title, feed link and entries of an Atom document.

    Args:
        elementTree: parsed document whose root is the Atom ``<feed>``
            element; only its ``find``/``findall`` methods are used.
        max_items: maximum number of ``<entry>`` elements to return.
            Defaults to the module-level ``MAX_ITEMS``.

    Returns:
        A ``(title, link, items)`` tuple where ``items`` is a list of
        ``(entry_title, entry_link)`` pairs in document order.  A missing
        title or link yields an empty string instead of raising.
    """
    if max_items is None:
        max_items = MAX_ITEMS

    ns = "http://www.w3.org/2005/Atom"
    title_tag = "{%s}title" % ns
    link_tag = "{%s}link" % ns
    entry_tag = "{%s}entry" % ns

    title = _atom_text(elementTree, title_tag)
    link = _atom_alternate_href(elementTree, link_tag)

    items = [
        (_atom_text(entry, title_tag), _atom_alternate_href(entry, link_tag))
        for entry in elementTree.findall(entry_tag)[:max_items]
    ]

    return (title, link, items)
def _to_html(docstruct):
    """Render the parsed feeds as a UTF-8 encoded HTML document.

    ``docstruct`` is a sequence whose entries are either ``None`` (skipped)
    or ``(title, link, items)`` tuples, ``items`` being a list of
    ``(item_title, item_link)`` pairs.  Each feed becomes an ``h1`` link
    followed by a paragraph of item links separated by " - ".  Every link
    gets a CSS class "z<N>" with N cycling through ``MAX_LINK_Z`` values.
    """
    page = html.HTML()

    # NOTE(review): this presumably emits a <header> element rather than
    # <head>; confirm which one is intended.
    page_header = page.header
    page_header.title("myrss")
    page_header.link(rel="stylesheet", type="text/css", href="index.css")

    # Running count of links emitted so far; drives the cycling CSS class.
    emitted = 0

    for entry in docstruct:
        if entry is not None:
            (feed_title, feed_link, feed_items) = entry

            feed_class = "z%d" % (emitted % MAX_LINK_Z)
            page.h1.a(feed_title, href=feed_link, klass=feed_class)
            emitted += 1

            paragraph = page.p
            for (position, (item_title, item_link)) in enumerate(feed_items):
                # Separator goes between items, never before the first one.
                if position > 0:
                    paragraph += " - "

                item_class = "z%d" % (emitted % MAX_LINK_Z)
                paragraph.a(item_title, href=item_link, klass=item_class)
                emitted += 1

    return unicode(page).encode("utf-8")
def _process_url(url, timeout=30):
    """Download the feed at *url* and parse it into a feed tuple.

    Args:
        url: address of an RSS 2.0 or Atom feed.
        timeout: seconds to wait on the network before giving up, so a
            stalled server cannot block the calling thread indefinitely.

    Returns:
        The ``(title, link, items)`` tuple produced by ``_go_rss`` or
        ``_go_atom``, or ``None`` when the URL could not be opened.

    Raises:
        NotImplementedError: the document is neither RSS >= 2.0 nor Atom.
        Other exceptions from reading or parsing the response propagate.
    """
    # A single write keeps message and newline together when several
    # threads log at the same time.
    sys.stderr.write("--> processing %s\n" % url)

    try:
        feed = urllib2.urlopen(url, timeout=timeout)
    except urllib2.URLError as e:
        # URLError is the parent of HTTPError, so this also covers DNS and
        # connection failures, not only HTTP error statuses.
        sys.stderr.write("--> (%s) %s\n" % (url, e))
        return None

    try:
        elementTree = xml.etree.ElementTree.parse(feed)
    finally:
        # Release the connection whether or not parsing succeeded.
        feed.close()

    root = elementTree.getroot()
    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version < 2.0:
            raise NotImplementedError("Unsupported rss version")
        return _go_rss(elementTree)

    if parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
        return _go_atom(elementTree)

    raise NotImplementedError("Unknown root tag")
class WorkerThread(threading.Thread):
    """Daemon thread that fetches feeds taken from an input queue.

    Requires two keyword arguments, removed before the remaining arguments
    are handed to ``threading.Thread``:

    * ``input_queue``: queue of ``(index, url)`` work items.
    * ``output_queue``: queue receiving ``(index, feed)`` results, where
      ``feed`` is ``None`` when processing the URL failed.
    """

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        # Daemon threads do not keep the interpreter alive once the main
        # thread is done; run() itself never returns.
        self.daemon = True

    def run(self):
        """Process work items forever, reporting one result per item."""
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                docfeed = _process_url(url)
            except Exception as e:
                # One bad feed must not kill the worker; log and report None.
                sys.stderr.write("--> (%s) exception: %s\n" % (url, e))
            finally:
                # Always publish a result and acknowledge the item, otherwise
                # a join() on the input queue would never return.
                self._output_queue.put((idx, docfeed))
                self._input_queue.task_done()
if __name__ == "__main__":
    # Read the feed list: one URL per line, '#' starts a comment line.
    with open("FEEDS") as feeds_file:
        feedlines = feeds_file.readlines()

    # One slot per input line so feeds are rendered in file order; slots of
    # skipped or failed lines stay None.
    docstruct = [None] * len(feedlines)

    # Unbounded queues: the amount of work is fixed by the FEEDS file.
    iq = Queue.Queue()
    oq = Queue.Queue()

    for _ in range(MAX_THREADS):
        WorkerThread(input_queue=iq, output_queue=oq).start()

    for (i, line) in enumerate(feedlines):
        url = line.strip()
        # Skip blank lines and comments instead of trying to fetch them.
        if not url or url.startswith("#"):
            continue
        iq.put((i, url))

    # Wait until every queued URL has been processed.
    iq.join()

    # Drain the results; all workers are idle at this point.
    while True:
        try:
            (idx, docfeed) = oq.get_nowait()
        except Queue.Empty:
            break
        docstruct[idx] = docfeed

    sys.stdout.write(_to_html(docstruct) + "\n")