comparison myrss/myrss_parser.py @ 42:a1456ecd25b9

fix bug with threadpool; create WSGI app and test server; use logging
author paulo
date Mon, 04 Feb 2013 23:58:02 -0800
parents 5f9bc02e9caf
children

--- myrss/myrss_parser.py (2:226e506e9c82)
+++ myrss/myrss_parser.py (3:a5668d7bdf5b)
@@ -4,10 +4,12 @@
 import urllib2
 import threading
 import Queue
 import datetime
 import time
+import logging
+logging.basicConfig(level=logging.INFO)
 
 import html
 import xml.etree.ElementTree
 
 
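The new logging calls elsewhere in this change all go through the root logger configured by the basicConfig() call added above. As a minimal standalone sketch of that pattern (the function and URL below are illustrative, not from the changeset):

import logging

# One-time root-logger setup at import time, as the new revision does;
# INFO-level messages replace the old "print >> sys.stderr" diagnostics.
logging.basicConfig(level=logging.INFO)

def fetch(url):
    logging.info("processing %s", url)

if __name__ == "__main__":
    fetch("http://example.com/feed")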
@@ -76,11 +78,11 @@
 
 def _to_html(dtnow, docstruct):
     datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
     page_title = "myrss -- %s" % datetime_str
 
-    root = html.HTML()
+    root = html.HTML("html")
 
     header = root.header
     header.title(page_title)
     header.link(rel="stylesheet", type="text/css", href="index.css")
 
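The html module imported at the top appears to be the third-party HTML-builder package of that name; passing "html" to HTML() gives the document an explicit <html> root element instead of an anonymous container. Assuming that package, a rough sketch of how the builder is used (the page content here is made up):

import html

root = html.HTML("html")                 # explicit <html> root element

head = root.header                       # attribute access creates a child tag
head.title("myrss -- 2013-02-04 23:58 PST")
head.link(rel="stylesheet", type="text/css", href="index.css")

body = root.body
body.p("hello, feeds")                   # calling with text adds text content

print str(root)                          # render the whole tree as markup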
@@ -111,14 +113,14 @@
 
 def _process_url(url):
     ret = None
 
     try:
-        print >> sys.stderr, "--> processing %s" % url
+        logging.info("processing %s" % url)
         feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
     except urllib2.HTTPError as e:
-        print >> sys.stderr, "--> (%s) %s" % (url, e)
+        logging.info("(%s) %s" % (url, e))
         return ret
 
     elementTree = xml.etree.ElementTree.parse(feed)
     root = elementTree.getroot()
 
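The feed parsing that follows these context lines is outside this hunk. For reference, a self-contained sketch of the same urllib2 + ElementTree approach applied to an RSS 2.0 feed; the URL and element paths are assumptions, not taken from the module:

import urllib2
import xml.etree.ElementTree

def fetch_titles(url):
    # Empty User-Agent, as in _process_url: some feeds reject urllib2's default agent.
    req = urllib2.Request(url, headers={"User-Agent": ''})
    feed = urllib2.urlopen(req)

    tree = xml.etree.ElementTree.parse(feed)
    root = tree.getroot()

    # Assumes an RSS 2.0 layout: <rss><channel><item><title>...</title></item></channel></rss>
    return [item.findtext("title") for item in root.findall("channel/item")]

if __name__ == "__main__":
    for title in fetch_titles("http://example.com/rss.xml"):
        print title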
@@ -150,10 +152,9 @@
             (idx, url) = self._input_queue.get()
             docfeed = None
             try:
                 docfeed = _process_url(url)
             except Exception as e:
-                print >> sys.stderr, "--> (%s) exception: %s" % (url, e)
+                logging.info("(%s) exception: %s" % (url, e))
             self._output_queue.put((idx, docfeed))
-            self._input_queue.task_done()
 
 
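Only the body of the worker loop is visible in this hunk; the task_done() call goes away because main() below no longer joins the input queue but counts results off the output queue instead. A self-contained approximation of such a worker, with _process_url stubbed out (the constructor, daemon flag, and stub are assumptions; only the loop body mirrors the changeset):

import threading
import Queue
import logging

logging.basicConfig(level=logging.INFO)

def _process_url(url):
    return "feed for %s" % url           # stand-in for the real fetch/parse

class WorkerThread(threading.Thread):
    def __init__(self, input_queue, output_queue):
        threading.Thread.__init__(self)
        self.daemon = True               # don't keep the process alive on exit
        self._input_queue = input_queue
        self._output_queue = output_queue

    def run(self):
        # Loop forever: each task is an (idx, url) pair, each result an
        # (idx, docfeed) pair; docfeed stays None if processing fails.
        while True:
            (idx, url) = self._input_queue.get()
            docfeed = None
            try:
                docfeed = _process_url(url)
            except Exception as e:
                logging.info("(%s) exception: %s" % (url, e))
            self._output_queue.put((idx, docfeed))

if __name__ == "__main__":
    iq = Queue.Queue()
    oq = Queue.Queue()
    WorkerThread(input_queue=iq, output_queue=oq).start()
    iq.put((0, "http://example.com/rss.xml"))
    print oq.get()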
@@ -160,6 +161,6 @@
-def main():
+def main(input_queue, output_queue):
     ret = ''
 
     epoch_now = time.time()
     dtnow = datetime.datetime.fromtimestamp(epoch_now)
 
@@ -170,33 +171,24 @@
     else:
         with open(FEEDS_FILE) as feeds_file:
             feedlines = feeds_file.readlines()
 
         docstruct = [None]*len(feedlines)
-        iq = Queue.Queue(feedlines)
-        oq = Queue.Queue(feedlines)
-
-        for _ in range(MAX_THREADS):
-            WorkerThread(input_queue=iq, output_queue=oq).start()
-
+        num_input = 0
         for (i, l) in enumerate(feedlines):
             if l[0] != '#':
                 l = l.strip()
-                iq.put((i, l))
+                input_queue.put((i, l))
+                num_input += 1
 
-        iq.join()
-
-        while True:
-            try:
-                (idx, docfeed) = oq.get_nowait()
-                docstruct[idx] = docfeed
-            except Queue.Empty:
-                break
+        for _ in range(num_input):
+            (idx, docfeed) = output_queue.get()
+            docstruct[idx] = docfeed
 
         ret = _to_html(dtnow, docstruct)
 
         with open(CACHE_HTML_FILE, 'w') as cache_html_file:
             cache_html_file.write(ret)
 
     return ret
 
 
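This collection logic is the heart of the thread-pool fix: main() now counts how many URLs it submitted and performs exactly that many blocking get() calls on the output queue, so it returns only after every submitted feed has produced a result (or None), replacing the earlier join()-plus-get_nowait() drain. Note also that the old code passed the feedlines list where Queue.Queue() expects an integer maxsize; the queues are now created with Queue.Queue(MAX_THREADS) in MyRssApp below. A stripped-down sketch of the submit-and-collect-by-count pattern, with the real per-URL work reduced to a placeholder:

import threading
import Queue

def worker(input_queue, output_queue):
    while True:
        (idx, item) = input_queue.get()
        output_queue.put((idx, item.upper()))     # placeholder for real work

def scatter_gather(input_queue, output_queue, items):
    # Submit every task, remembering how many went in.
    num_input = 0
    for (i, item) in enumerate(items):
        input_queue.put((i, item))
        num_input += 1

    # Blocking get() exactly num_input times: results arrive in any order,
    # so the index carried with each task puts them back in place.
    results = [None] * len(items)
    for _ in range(num_input):
        (idx, value) = output_queue.get()
        results[idx] = value
    return results

if __name__ == "__main__":
    iq = Queue.Queue()
    oq = Queue.Queue()
    for _ in range(2):
        t = threading.Thread(target=worker, args=(iq, oq))
        t.daemon = True
        t.start()
    print scatter_gather(iq, oq, ["alpha", "beta", "gamma"])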
@@ -203,3 +195,19 @@
-if __name__ == "__main__":
-    print main()
-
+class MyRssApp:
+    def __init__(self):
+        self._iq = Queue.Queue(MAX_THREADS)
+        self._oq = Queue.Queue(MAX_THREADS)
+
+        for _ in range(MAX_THREADS):
+            WorkerThread(input_queue=self._iq, output_queue=self._oq).start()
+
+    def __call__(self, environ, start_response):
+        response_body = main(self._iq, self._oq)
+        response_headers = [
+            ("Content-Type", "text/html"),
+            ("Content-Length", str(len(response_body))),
+        ]
+        start_response("200 OK", response_headers)
+
+        return [response_body]
+
+
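MyRssApp is a plain WSGI callable: the constructor starts the worker pool once, and each request runs main() against the shared queues and returns the rendered page as the response body. The test server mentioned in the commit message is not part of this file; one hypothetical way to serve the app locally with the stdlib wsgiref server (module path, host, and port are assumptions):

from wsgiref.simple_server import make_server

import myrss_parser    # assumes this module is importable as myrss_parser

if __name__ == "__main__":
    httpd = make_server("127.0.0.1", 8000, myrss_parser.MyRssApp())
    print "serving on http://127.0.0.1:8000/"
    httpd.serve_forever()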