comparison myrss/myrss_parser.py @ 40:62464a0034d1

add threaded url opener
author paulo
date Thu, 31 Jan 2013 02:19:39 -0800
parents 915032dd35f4
children 5f9bc02e9caf
comparison
equal deleted inserted replaced
0:9aec8c0eb6b1 1:492d8d321f43
1 import os 1 import os
2 import sys
2 import re 3 import re
4 import urllib2
5 import threading
6 import Queue
3 7
4 import html 8 import html
5 import xml.etree.ElementTree 9 import xml.etree.ElementTree
6 10
7 11
8 MAX_ITEMS = 30 12 MAX_ITEMS = 30
9 MAX_LINK_Z = 4 13 MAX_LINK_Z = 4
14 MAX_THREADS = 20
10 15
11 16
12 _PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)") 17 _PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")
13 18
14 def _parse_root_tag(root_tag): 19 def _parse_root_tag(root_tag):
70 header.link(rel="stylesheet", type="text/css", href="index.css") 75 header.link(rel="stylesheet", type="text/css", href="index.css")
71 76
72 link_z = 0 77 link_z = 0
73 78
74 for feed in docstruct: 79 for feed in docstruct:
80 if feed is None:
81 continue
82
75 (title, link, items) = feed 83 (title, link, items) = feed
76 84
77 root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z)) 85 root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
78 link_z += 1 86 link_z += 1
79 p = root.p 87 p = root.p
86 link_z += 1 94 link_z += 1
87 95
88 return unicode(root).encode("utf-8") 96 return unicode(root).encode("utf-8")
89 97
90 98
def _process_url(url):
    """Download one feed and convert it with the matching feed handler.

    The document root decides the handler: an un-namespaced ``rss`` root
    with version >= 2.0 goes to ``_go_rss``; an Atom ``feed`` root goes to
    ``_go_atom``.

    Args:
        url: address of the feed to download.

    Returns:
        Whatever ``_go_rss`` / ``_go_atom`` returns (presumably the
        ``(title, link, items)`` tuple that ``_to_html`` unpacks -- confirm
        against those helpers), or ``None`` when the URL could not be opened.

    Raises:
        NotImplementedError: for RSS older than 2.0 or an unknown root tag.
    """
    # This function runs in worker threads.  A single write() per message
    # keeps each log line in one piece; the print statement emits the text
    # and the newline separately, so lines from different threads can mix.
    sys.stderr.write("--> processing %s\n" % url)

    try:
        feed = urllib2.urlopen(url)
    except urllib2.URLError as e:
        # HTTPError is a subclass of URLError, so HTTP status errors and
        # lower-level failures (DNS, refused connection) are both reported
        # here and both yield None.
        sys.stderr.write("--> (%s) %s\n" % (url, e))
        return None

    # Always release the connection, even when the XML is malformed.
    try:
        element_tree = xml.etree.ElementTree.parse(feed)
    finally:
        feed.close()

    root = element_tree.getroot()
    parsed_root_tag = _parse_root_tag(root.tag)

    if parsed_root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version >= 2.0:
            return _go_rss(element_tree)
        raise NotImplementedError("Unsupported rss version")

    if parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"):
        return _go_atom(element_tree)

    raise NotImplementedError("Unknown root tag")
126
127
class WorkerThread(threading.Thread):
    """Daemon thread that processes feed URLs taken from a queue.

    Required keyword arguments (removed before the remaining arguments are
    forwarded to ``threading.Thread``):
        input_queue: queue yielding ``(idx, url)`` pairs.
        output_queue: queue receiving ``(idx, docfeed)`` pairs, where
            ``docfeed`` is the result of ``_process_url(url)`` or ``None``
            if processing raised an exception.
    """

    def __init__(self, *args, **kwargs):
        self._input_queue = kwargs.pop("input_queue")
        self._output_queue = kwargs.pop("output_queue")
        threading.Thread.__init__(self, *args, **kwargs)
        # Daemon threads do not keep the interpreter alive, so the endless
        # loop in run() needs no explicit shutdown signal.
        self.daemon = True

    def run(self):
        """Process queue items forever, one result per item."""
        while True:
            item = self._input_queue.get()
            try:
                (idx, url) = item
                docfeed = None
                try:
                    docfeed = _process_url(url)
                except Exception as e:
                    # Best effort: a failing feed is logged and reported as
                    # None instead of killing the worker.  One write() call
                    # keeps the line intact when several threads log.
                    sys.stderr.write("--> (%s) exception: %s\n" % (url, e))
                self._output_queue.put((idx, docfeed))
            finally:
                # Must run for every get(), whatever happened above;
                # otherwise a join() on the input queue never returns.
                self._input_queue.task_done()
145
146
if __name__ == "__main__":
    # FEEDS lists one feed URL per line; blank lines and lines starting
    # with '#' are ignored.
    with open("FEEDS") as feeds_file:
        feedlines = feeds_file.readlines()

    # One slot per line of FEEDS so the output keeps the file's order.
    # Slots of skipped lines and failed feeds stay None, which _to_html
    # skips.
    docstruct = [None] * len(feedlines)

    # Strip before testing, so indented comments and whitespace-only lines
    # are skipped instead of being queued as URLs.
    jobs = []
    for (i, raw_line) in enumerate(feedlines):
        line = raw_line.strip()
        if line and not line.startswith('#'):
            jobs.append((i, line))

    # maxsize must be an integer; the list itself was passed by mistake.
    iq = Queue.Queue(len(feedlines))
    oq = Queue.Queue(len(feedlines))

    # No point in starting more workers than there are feeds.
    for _ in range(min(MAX_THREADS, len(jobs))):
        WorkerThread(input_queue=iq, output_queue=oq).start()

    for job in jobs:
        iq.put(job)

    # Each worker puts its result before calling task_done(), so once
    # join() returns every result is already waiting in the output queue.
    iq.join()

    while True:
        try:
            (idx, docfeed) = oq.get_nowait()
        except Queue.Empty:
            break
        docstruct[idx] = docfeed

    sys.stdout.write(_to_html(docstruct) + "\n")