diff myrss/myrss_parser.py @ 41:5f9bc02e9caf

add datetimestamp and caching
author paulo
date Fri, 01 Feb 2013 01:26:07 -0800
parents 62464a0034d1
children a1456ecd25b9
line diff
     1.1 --- a/myrss/myrss_parser.py	Thu Jan 31 02:19:39 2013 -0800
     1.2 +++ b/myrss/myrss_parser.py	Fri Feb 01 01:26:07 2013 -0800
     1.3 @@ -4,11 +4,18 @@
     1.4  import urllib2
     1.5  import threading
     1.6  import Queue
     1.7 +import datetime
     1.8 +import time
     1.9  
    1.10  import html
    1.11  import xml.etree.ElementTree 
    1.12  
    1.13  
    1.14 +FEEDS_FILE = "FEEDS"
    1.15 +CACHE_HTML_FILE = "__cache__.html"
    1.16 +
    1.17 +#CACHE_LIFE = 1200 # [seconds]
    1.18 +CACHE_LIFE = 30 # [seconds]
    1.19  MAX_ITEMS = 30
    1.20  MAX_LINK_Z = 4
    1.21  MAX_THREADS = 20
    1.22 @@ -67,13 +74,19 @@
    1.23  	return (title, link, items)
    1.24  
    1.25  
    1.26 -def _to_html(docstruct):
    1.27 +def _to_html(dtnow, docstruct):
    1.28 +	datetime_str = dtnow.strftime("%Y-%m-%d %H:%M %Z")
    1.29 +	page_title = "myrss -- %s" % datetime_str
    1.30 +
    1.31  	root = html.HTML()
    1.32  
    1.33  	header = root.header
    1.34 -	header.title("myrss")
    1.35 +	header.title(page_title)
    1.36  	header.link(rel="stylesheet", type="text/css", href="index.css")
    1.37  
    1.38 +	body = root.body
    1.39 +	body.h1(page_title)
    1.40 +
    1.41  	link_z = 0
    1.42  
    1.43  	for feed in docstruct:
    1.44 @@ -82,9 +95,9 @@
    1.45  
    1.46  		(title, link, items) = feed
    1.47  
    1.48 -		root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
    1.49 +		body.h2.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
    1.50  		link_z += 1
    1.51 -		p = root.p
    1.52 +		p = body.p
    1.53  
    1.54  		for (i, (it_title, it_link)) in enumerate(items):
    1.55  			if i > 0:
    1.56 @@ -101,7 +114,7 @@
    1.57  
    1.58  	try:
    1.59  		print >> sys.stderr, "--> processing %s" % url
    1.60 -		feed = urllib2.urlopen(url)
    1.61 +		feed = urllib2.urlopen(urllib2.Request(url, headers={"User-Agent": ''}))
    1.62  	except urllib2.HTTPError as e:
    1.63  		print >> sys.stderr, "--> (%s) %s" % (url, e)
    1.64  		return ret
    1.65 @@ -144,29 +157,49 @@
    1.66  			self._input_queue.task_done()
    1.67  			
    1.68  
    1.69 +def main():
    1.70 +	ret = ''
    1.71 +
    1.72 +	epoch_now = time.time()
    1.73 +	dtnow = datetime.datetime.fromtimestamp(epoch_now)
    1.74 +
    1.75 +	if os.path.exists(CACHE_HTML_FILE) and (epoch_now - os.stat(CACHE_HTML_FILE).st_mtime) < float(CACHE_LIFE):
    1.76 +		with open(CACHE_HTML_FILE) as cache_html_file:
    1.77 +			ret = cache_html_file.read()
    1.78 +
    1.79 +	else:
    1.80 +		with open(FEEDS_FILE) as feeds_file:
    1.81 +			feedlines = feeds_file.readlines()
    1.82 +
    1.83 +		docstruct = [None]*len(feedlines)
    1.84 +		iq = Queue.Queue(feedlines)
    1.85 +		oq = Queue.Queue(feedlines)
    1.86 +
    1.87 +		for _ in range(MAX_THREADS):
    1.88 +			WorkerThread(input_queue=iq, output_queue=oq).start()
    1.89 +
    1.90 +		for (i, l) in enumerate(feedlines):
    1.91 +			if l[0] != '#':
    1.92 +				l = l.strip()
    1.93 +				iq.put((i, l))
    1.94 +
    1.95 +		iq.join()
    1.96 +
    1.97 +		while True:
    1.98 +			try:
    1.99 +				(idx, docfeed) = oq.get_nowait()
   1.100 +				docstruct[idx] = docfeed
   1.101 +			except Queue.Empty:
   1.102 +				break
   1.103 +
   1.104 +		ret = _to_html(dtnow, docstruct)
   1.105 +
   1.106 +		with open(CACHE_HTML_FILE, 'w') as cache_html_file:
   1.107 +			cache_html_file.write(ret)
   1.108 +
   1.109 +	return ret
   1.110 +
   1.111 +
   1.112  if __name__ == "__main__":
   1.113 -	with open("FEEDS") as feeds_file:
   1.114 -		feedlines = feeds_file.readlines()
   1.115 +	print main()
   1.116  
   1.117 -	docstruct = [None]*len(feedlines)
   1.118 -	iq = Queue.Queue(feedlines)
   1.119 -	oq = Queue.Queue(feedlines)
   1.120 -
   1.121 -	for _ in range(MAX_THREADS):
   1.122 -		WorkerThread(input_queue=iq, output_queue=oq).start()
   1.123 -
   1.124 -	for (i, l) in enumerate(feedlines):
   1.125 -		if l[0] != '#':
   1.126 -			l = l.strip()
   1.127 -			iq.put((i, l))
   1.128 -
   1.129 -	iq.join()
   1.130 -
   1.131 -	while True:
   1.132 -		try:
   1.133 -			(idx, docfeed) = oq.get_nowait()
   1.134 -			docstruct[idx] = docfeed
   1.135 -		except Queue.Empty:
   1.136 -			break
   1.137 -
   1.138 -	print _to_html(docstruct)