diff myrss/myrss_parser.py @ 39:915032dd35f4

add myrss2
author paulo
date Wed, 30 Jan 2013 02:32:22 -0800
parents
children 62464a0034d1
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/myrss/myrss_parser.py	Wed Jan 30 02:32:22 2013 -0800
     1.3 @@ -0,0 +1,115 @@
     1.4 +import os
     1.5 +import re
     1.6 +
     1.7 +import html
     1.8 +import xml.etree.ElementTree 
     1.9 +
    1.10 +
# Upper bound on how many items/entries are kept from each feed; the
# findall() results in _go_rss() and _go_atom() are sliced to this length.
MAX_ITEMS = 30
# Number of distinct "z<N>" class values; _to_html() numbers every anchor it
# emits and uses that counter modulo this value as N.
MAX_LINK_Z = 4
    1.13 +
    1.14 +
    1.15 +_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")
    1.16 +
    1.17 +def _parse_root_tag(root_tag):
    1.18 +	re_match = _PARSE_ROOT_TAG_RE.match(root_tag)
    1.19 +
    1.20 +	if re_match is None:
    1.21 +		return (None, None)
    1.22 +	else:
    1.23 +		return re_match.group(2, 3)
    1.24 +	
    1.25 +
    1.26 +def _go_rss(elementTree):
    1.27 +	title = elementTree.find("channel/title").text.strip()
    1.28 +	link = elementTree.find("channel/link").text
    1.29 +
    1.30 +	items = []
    1.31 +
    1.32 +	for i in elementTree.findall("channel/item")[:MAX_ITEMS]:
    1.33 +		it_title = i.find("title").text.strip()
    1.34 +		it_link = i.find("link").text
    1.35 +
    1.36 +		items.append((it_title, it_link))
    1.37 +
    1.38 +	return (title, link, items)
    1.39 +
    1.40 +
    1.41 +def _go_atom(elementTree):
    1.42 +	ns = "http://www.w3.org/2005/Atom"
    1.43 +
    1.44 +	title = elementTree.find("{%s}title" % ns).text.strip()
    1.45 +	link = ''
    1.46 +
    1.47 +	for i in elementTree.findall("{%s}link" % ns):
    1.48 +		if i.get("type") == "text/html" and i.get("rel") == "alternate":
    1.49 +			link = i.get("href")
    1.50 +			break
    1.51 +
    1.52 +	items = []
    1.53 +
    1.54 +	for i in elementTree.findall("{%s}entry" % ns)[:MAX_ITEMS]:
    1.55 +		it_title = i.find("{%s}title" % ns).text.strip()
    1.56 +		it_link = ''
    1.57 +		
    1.58 +		for j in i.findall("{%s}link" % ns):
    1.59 +			if j.get("type") == "text/html" and j.get("rel") == "alternate":
    1.60 +				it_link = j.get("href")
    1.61 +				break
    1.62 +
    1.63 +		items.append((it_title, it_link))
    1.64 +
    1.65 +	return (title, link, items)
    1.66 +
    1.67 +
    1.68 +def _to_html(docstruct):
    1.69 +	root = html.HTML()
    1.70 +
    1.71 +	header = root.header
    1.72 +	header.title("myrss")
    1.73 +	header.link(rel="stylesheet", type="text/css", href="index.css")
    1.74 +
    1.75 +	link_z = 0
    1.76 +
    1.77 +	for feed in docstruct:
    1.78 +		(title, link, items) = feed
    1.79 +
    1.80 +		root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
    1.81 +		link_z += 1
    1.82 +		p = root.p
    1.83 +
    1.84 +		for (i, (it_title, it_link)) in enumerate(items):
    1.85 +			if i > 0:
    1.86 +				p += " - "
    1.87 +
    1.88 +			p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
    1.89 +			link_z += 1
    1.90 +
    1.91 +	return unicode(root).encode("utf-8")
    1.92 +
    1.93 +
    1.94 +if __name__ == "__main__":
    1.95 +
    1.96 +	docstruct = []
    1.97 +	XMLFILE = "%d.feedtmp"
    1.98 +	
    1.99 +	for i in range(31):
   1.100 +		if os.path.exists(XMLFILE % i):
   1.101 +			elementTree = xml.etree.ElementTree.parse(XMLFILE % i)
   1.102 +			root = elementTree.getroot()
   1.103 +
   1.104 +			if _parse_root_tag(root.tag) == (None, "rss"):
   1.105 +				version = float(root.get("version", 0.0))
   1.106 +				if version >= 2.0:
   1.107 +					docstruct.append(_go_rss(elementTree))
   1.108 +				else:
   1.109 +					raise NotImplementedError("Unsupported rss version")
   1.110 +			elif _parse_root_tag(root.tag) == ("http://www.w3.org/2005/Atom", "feed"):
   1.111 +				docstruct.append(_go_atom(elementTree))
   1.112 +			else:
   1.113 +				raise NotImplementedError("Unknown root tag")
   1.114 +
   1.115 +	if len(docstruct) > 0:
   1.116 +		print _to_html(docstruct)
   1.117 +	else:
   1.118 +		raise RuntimeError("Could not produce docstruct")