comparison myrss/myrss_parser.py @ 41:5f9bc02e9caf

add datetimestamp and caching
author paulo
date Fri, 01 Feb 2013 01:26:07 -0800
parents 62464a0034d1
children a1456ecd25b9
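
This changeset does two things to the page generator: it stamps the rendered page with the generation time (the datetime is threaded through _to_html() and shown in the <title> and a new <h1>), and it caches the rendered HTML in __cache__.html, reusing it while the file is younger than CACHE_LIFE seconds. Along the way, the urllib2 fetch gains an empty User-Agent header and the old inline __main__ block is folded into a main() function.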
--- a/myrss/myrss_parser.py
+++ b/myrss/myrss_parser.py
@@ -2,15 +2,23 @@
 import sys
 import re
 import urllib2
 import threading
 import Queue
+import datetime
+import time
+import os  # needed by the cache-freshness check in main() below
 
 import html
 import xml.etree.ElementTree
 
 
+FEEDS_FILE = "FEEDS"
+CACHE_HTML_FILE = "__cache__.html"
+
+#CACHE_LIFE = 1200 # [seconds]
+CACHE_LIFE = 30 # [seconds]
 MAX_ITEMS = 30
 MAX_LINK_Z = 4
 MAX_THREADS = 20
 
 
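Two details worth noting in the new constants: the cache is keyed purely on the file named by CACHE_HTML_FILE, and the commented-out 1200-second CACHE_LIFE next to the active 30-second value reads like a production setting parked while the new code path was being tested.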
@@ -65,28 +72,34 @@
         items.append((it_title, it_link))
 
     return (title, link, items)
 
 
-def _to_html(docstruct):
+def _to_html(dtnow, docstruct):
+    datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
+    page_title = "myrss -- %s" % datetime_str
+
     root = html.HTML()
 
     header = root.header
-    header.title("myrss")
+    header.title(page_title)
     header.link(rel="stylesheet", type="text/css", href="index.css")
+
+    body = root.body
+    body.h1(page_title)
 
     link_z = 0
 
     for feed in docstruct:
         if feed is None:
             continue
 
         (title, link, items) = feed
 
-        root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
+        body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
         link_z += 1
-        p = root.p
+        p = body.p
 
         for (i, (it_title, it_link)) in enumerate(items):
             if i > 0:
                 p += " - "
 
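A small caveat in the new timestamp: datetime.datetime.fromtimestamp() returns a naive datetime, so the %Z in the strftime() format renders as an empty string and the page title carries no timezone name. As for the markup, the html module used here is the third-party tag-builder package in which attribute access creates nested elements and klass stands in for the reserved keyword class. A minimal sketch of the pattern, with a made-up feed title and URL (not from this repo):

    import html

    root = html.HTML()                       # root of the tag tree
    body = root.body
    body.h2.a("Example feed", href="http://feed.example.com/", klass="z0")
    p = body.p
    p += "item one"                          # appending text, as the item loop above does
    print str(root)                          # serialize the whole tree to markup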
@@ -99,11 +112,11 @@
 def _process_url(url):
     ret = None
 
     try:
         print >> sys.stderr, "--> processing %s" % url
-        feed = urllib2.urlopen(url)
+        feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
     except urllib2.HTTPError as e:
         print >> sys.stderr, "--> (%s) %s" % (url, e)
         return ret
 
     elementTree = xml.etree.ElementTree.parse(feed)
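The new Request object exists only to blank out the User-Agent: urllib2 otherwise announces itself as Python-urllib/2.x, presumably a string that some feed hosts reject or throttle. Sending an empty agent is a blunt but effective workaround.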
@@ -142,31 +155,51 @@
                 print >> sys.stderr, "--> (%s) exception: %s" % (url, e)
             self._output_queue.put((idx, docfeed))
             self._input_queue.task_done()
 
 
+def main():
+    ret = ''
+
+    epoch_now = time.time()
+    dtnow = datetime.datetime.fromtimestamp(epoch_now)
+
+    if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
+        with open(CACHE_HTML_FILE) as cache_html_file:
+            ret = cache_html_file.read()
+
+    else:
+        with open(FEEDS_FILE) as feeds_file:
+            feedlines = feeds_file.readlines()
+
+        docstruct = [None]*len(feedlines)
+        iq = Queue.Queue()  # Queue.Queue takes an int maxsize; unbounded here
+        oq = Queue.Queue()
+
+        for _ in range(MAX_THREADS):
+            WorkerThread(input_queue=iq, output_queue=oq).start()
+
+        for (i, l) in enumerate(feedlines):
+            if l[0] != '#':
+                l = l.strip()
+                iq.put((i, l))
+
+        iq.join()
+
+        while True:
+            try:
+                (idx, docfeed) = oq.get_nowait()
+                docstruct[idx] = docfeed
+            except Queue.Empty:
+                break
+
+        ret = _to_html(dtnow, docstruct)
+
+        with open(CACHE_HTML_FILE, 'w') as cache_html_file:
+            cache_html_file.write(ret)
+
+    return ret
+
+
 if __name__ == "__main__":
-    with open("FEEDS") as feeds_file:
-        feedlines = feeds_file.readlines()
-
-    docstruct = [None]*len(feedlines)
-    iq = Queue.Queue(feedlines)
-    oq = Queue.Queue(feedlines)
-
-    for _ in range(MAX_THREADS):
-        WorkerThread(input_queue=iq, output_queue=oq).start()
-
-    for (i, l) in enumerate(feedlines):
-        if l[0] != '#':
-            l = l.strip()
-            iq.put((i, l))
-
-    iq.join()
-
-    while True:
-        try:
-            (idx, docfeed) = oq.get_nowait()
-            docstruct[idx] = docfeed
-        except Queue.Empty:
-            break
-
-    print _to_html(docstruct)
+    print main()
+
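The caching logic in main() leans on the filesystem instead of storing a timestamp anywhere: the cache file's own mtime, refreshed by the write at the end of the miss path, is the clock. A standalone sketch of the same check, with a hypothetical helper name:

    import os
    import time

    def cache_is_fresh(path, max_age_seconds):
        # fresh while the file exists and its mtime is younger than max_age_seconds
        return os.path.exists(path) and (time.time() - os.stat(path).st_mtime) < float(max_age_seconds)

Because the page is rewritten on every cache miss, os.stat(path).st_mtime always reflects the last regeneration, so no separate bookkeeping is needed.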