Mercurial > hg > index.fcgi > www > www-1
comparison myrss/myrss_parser.py @ 40:62464a0034d1
add threaded url opener
author | paulo |
---|---|
date | Thu, 31 Jan 2013 02:19:39 -0800 |
parents | 915032dd35f4 |
children | 5f9bc02e9caf |
comparison
legend: equal | deleted | inserted | replaced
0:9aec8c0eb6b1 | 1:492d8d321f43 |
---|---|
1 import os | 1 import os |
2 import sys | |
2 import re | 3 import re |
4 import urllib2 | |
5 import threading | |
6 import Queue | |
3 | 7 |
4 import html | 8 import html |
5 import xml.etree.ElementTree | 9 import xml.etree.ElementTree |
6 | 10 |
7 | 11 |
8 MAX_ITEMS = 30 | 12 MAX_ITEMS = 30 |
9 MAX_LINK_Z = 4 | 13 MAX_LINK_Z = 4 |
14 MAX_THREADS = 20 | |
10 | 15 |
11 | 16 |
12 _PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)") | 17 _PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)") |
13 | 18 |
14 def _parse_root_tag(root_tag): | 19 def _parse_root_tag(root_tag): |
70 header.link(rel="stylesheet", type="text/css", href="index.css") | 75 header.link(rel="stylesheet", type="text/css", href="index.css") |
71 | 76 |
72 link_z = 0 | 77 link_z = 0 |
73 | 78 |
74 for feed in docstruct: | 79 for feed in docstruct: |
80 if feed is None: | |
81 continue | |
82 | |
75 (title, link, items) = feed | 83 (title, link, items) = feed |
76 | 84 |
77 root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z)) | 85 root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z)) |
78 link_z += 1 | 86 link_z += 1 |
79 p = root.p | 87 p = root.p |
86 link_z += 1 | 94 link_z += 1 |
87 | 95 |
88 return unicode(root).encode("utf-8") | 96 return unicode(root).encode("utf-8") |
89 | 97 |
90 | 98 |
99 def _process_url(url): | |
100 ret = None | |
101 | |
102 try: | |
103 print >> sys.stderr, "--> processing %s" % url | |
104 feed = urllib2.urlopen(url) | |
105 except urllib2.HTTPError as e: | |
106 print >> sys.stderr, "--> (%s) %s" % (url, e) | |
107 return ret | |
108 | |
109 elementTree = xml.etree.ElementTree.parse(feed) | |
110 root = elementTree.getroot() | |
111 | |
112 parsed_root_tag = _parse_root_tag(root.tag) | |
113 | |
114 if parsed_root_tag == (None, "rss"): | |
115 version = float(root.get("version", 0.0)) | |
116 if version >= 2.0: | |
117 ret = _go_rss(elementTree) | |
118 else: | |
119 raise NotImplementedError("Unsupported rss version") | |
120 elif parsed_root_tag == ("http://www.w3.org/2005/Atom", "feed"): | |
121 ret = _go_atom(elementTree) | |
122 else: | |
123 raise NotImplementedError("Unknown root tag") | |
124 | |
125 return ret | |
126 | |
127 | |
128 class WorkerThread(threading.Thread): | |
129 def __init__(self, *args, **kwargs): | |
130 self._input_queue = kwargs.pop("input_queue") | |
131 self._output_queue = kwargs.pop("output_queue") | |
132 threading.Thread.__init__(self, *args, **kwargs) | |
133 self.daemon = True | |
134 | |
135 def run(self): | |
136 while True: | |
137 (idx, url) = self._input_queue.get() | |
138 docfeed = None | |
139 try: | |
140 docfeed = _process_url(url) | |
141 except Exception as e: | |
142 print >> sys.stderr, "--> (%s) exception: %s" % (url, e) | |
143 self._output_queue.put((idx, docfeed)) | |
144 self._input_queue.task_done() | |
145 | |
146 | |
91 if __name__ == "__main__": | 147 if __name__ == "__main__": |
148 with open("FEEDS") as feeds_file: | |
149 feedlines = feeds_file.readlines() | |
92 | 150 |
93 docstruct = [] | 151 docstruct = [None]*len(feedlines) |
94 XMLFILE = "%d.feedtmp" | 152 iq = Queue.Queue(feedlines) |
95 | 153 oq = Queue.Queue(feedlines) |
96 for i in range(31): | |
97 if os.path.exists(XMLFILE % i): | |
98 elementTree = xml.etree.ElementTree.parse(XMLFILE % i) | |
99 root = elementTree.getroot() | |
100 | 154 |
101 if _parse_root_tag(root.tag) == (None, "rss"): | 155 for _ in range(MAX_THREADS): |
102 version = float(root.get("version", 0.0)) | 156 WorkerThread(input_queue=iq, output_queue=oq).start() |
103 if version >= 2.0: | |
104 docstruct.append(_go_rss(elementTree)) | |
105 else: | |
106 raise NotImplementedError("Unsupported rss version") | |
107 elif _parse_root_tag(root.tag) == ("http://www.w3.org/2005/Atom", "feed"): | |
108 docstruct.append(_go_atom(elementTree)) | |
109 else: | |
110 raise NotImplementedError("Unknown root tag") | |
111 | 157 |
112 if len(docstruct) > 0: | 158 for (i, l) in enumerate(feedlines): |
113 print _to_html(docstruct) | 159 if l[0] != '#': |
114 else: | 160 l = l.strip() |
115 raise RuntimeError("Could not produce docstruct") | 161 iq.put((i, l)) |
162 | |
163 iq.join() | |
164 | |
165 while True: | |
166 try: | |
167 (idx, docfeed) = oq.get_nowait() | |
168 docstruct[idx] = docfeed | |
169 except Queue.Empty: | |
170 break | |
171 | |
172 print _to_html(docstruct) |