Mercurial > hg > index.fcgi > www > www-1
diff myrss/myrss_parser.py @ 39:915032dd35f4
add myrss2
author | paulo |
---|---|
date | Wed, 30 Jan 2013 02:32:22 -0800 |
parents | |
children | 62464a0034d1 |
line diff
"""Parse RSS 2.0 / Atom feed files and render them as one HTML page of links.

When run as a script, reads ``0.feedtmp`` .. ``30.feedtmp`` from the current
directory (missing files are skipped) and prints the generated HTML.
"""

import os
import re

import html
import xml.etree.ElementTree


MAX_ITEMS = 30
MAX_LINK_Z = 4

_ATOM_NS = "http://www.w3.org/2005/Atom"

# Script mode looks for files named 0.feedtmp .. (_MAX_FEED_FILES - 1).feedtmp.
_FEED_FILE_PATTERN = "%d.feedtmp"
_MAX_FEED_FILES = 31

_PARSE_ROOT_TAG_RE = re.compile(r"(\{(.+)\})?(.+)")


def _parse_root_tag(root_tag):
    """Split an ElementTree tag into ``(namespace, local_name)``.

    ``"{ns}name"`` yields ``("ns", "name")``; a tag without a ``{...}``
    prefix yields ``(None, "name")``.  Returns ``(None, None)`` when the
    pattern does not match at all (e.g. an empty string).
    """
    re_match = _PARSE_ROOT_TAG_RE.match(root_tag)

    if re_match is None:
        return (None, None)
    return re_match.group(2, 3)


def _text(element, default=""):
    """Return the stripped text of *element*.

    ``find()`` returns None for a missing element, and ``.text`` is None for
    an empty one such as ``<title/>``; both cases yield *default* instead of
    raising AttributeError.
    """
    if element is None or element.text is None:
        return default
    return element.text.strip()


def _go_rss(elementTree):
    """Extract ``(title, link, items)`` from a parsed RSS document.

    *items* is a list of ``(item_title, item_link)`` tuples, limited to the
    first MAX_ITEMS ``channel/item`` elements.  Missing or empty title/link
    elements produce empty strings; surrounding whitespace is stripped.
    """
    title = _text(elementTree.find("channel/title"))
    link = _text(elementTree.find("channel/link"))

    items = [
        (_text(item.find("title")), _text(item.find("link")))
        for item in elementTree.findall("channel/item")[:MAX_ITEMS]
    ]

    return (title, link, items)


def _atom_link(element):
    """Return the href of the preferred "alternate" link child of *element*.

    A link whose ``rel`` attribute is absent counts as ``rel="alternate"``
    (that is the default defined by RFC 4287).  The first alternate link
    with ``type="text/html"`` wins; failing that, the first alternate link
    that carries no ``type`` at all is used.  Returns ``''`` when no link
    qualifies.
    """
    untyped_href = None

    for link in element.findall("{%s}link" % _ATOM_NS):
        if link.get("rel", "alternate") != "alternate":
            continue

        link_type = link.get("type")
        if link_type == "text/html":
            return link.get("href", "")
        if link_type is None and untyped_href is None:
            # Remember it, but keep looking for an explicit text/html link.
            untyped_href = link.get("href", "")

    return untyped_href or ""


def _go_atom(elementTree):
    """Extract ``(title, link, items)`` from a parsed Atom document.

    *items* is a list of ``(entry_title, entry_link)`` tuples, limited to
    the first MAX_ITEMS entries.  Missing or empty titles produce empty
    strings; links are chosen by ``_atom_link``.
    """
    title = _text(elementTree.find("{%s}title" % _ATOM_NS))
    link = _atom_link(elementTree)

    items = [
        (_text(entry.find("{%s}title" % _ATOM_NS)), _atom_link(entry))
        for entry in elementTree.findall("{%s}entry" % _ATOM_NS)[:MAX_ITEMS]
    ]

    return (title, link, items)


def _to_html(docstruct):
    """Render a list of ``(title, link, items)`` feeds as UTF-8 encoded HTML.

    Each feed becomes an ``h1`` holding a link to the feed, followed by a
    ``p`` holding one link per item, separated by " - ".  Every link gets a
    class ``z0`` .. ``z<MAX_LINK_Z - 1>``, cycling over all links emitted.

    NOTE(review): relies on an ``html`` module exposing ``HTML`` --
    presumably the third-party HTML builder package, not the Python 3
    standard library module of the same name; confirm.  ``unicode`` is the
    Python 2 builtin, so this function is Python 2 only as written.
    """
    root = html.HTML()

    # NOTE(review): this presumably emits a <header> element rather than
    # <head>; confirm which one was intended before changing it.
    header = root.header
    header.title("myrss")
    header.link(rel="stylesheet", type="text/css", href="index.css")

    link_z = 0

    for feed in docstruct:
        (title, link, items) = feed

        root.h1.a(title, href=link, klass="z%d" % (link_z % MAX_LINK_Z))
        link_z += 1
        p = root.p

        for (i, (it_title, it_link)) in enumerate(items):
            if i > 0:
                p += " - "

            p.a(it_title, href=it_link, klass="z%d" % (link_z % MAX_LINK_Z))
            link_z += 1

    return unicode(root).encode("utf-8")


def _parse_feed(elementTree):
    """Dispatch a parsed feed document to the RSS or Atom extractor.

    Returns the ``(title, link, items)`` tuple of the matching extractor.

    Raises NotImplementedError for an RSS version below 2.0 or for a root
    element that is neither ``rss`` nor an Atom ``feed``.
    """
    root = elementTree.getroot()
    root_tag = _parse_root_tag(root.tag)

    if root_tag == (None, "rss"):
        version = float(root.get("version", 0.0))
        if version >= 2.0:
            return _go_rss(elementTree)
        raise NotImplementedError("Unsupported rss version")

    if root_tag == (_ATOM_NS, "feed"):
        return _go_atom(elementTree)

    raise NotImplementedError("Unknown root tag")


def _main():
    """Parse every existing feed file and print the combined HTML page.

    Raises RuntimeError when none of the expected feed files exists.
    """
    docstruct = []

    for i in range(_MAX_FEED_FILES):
        path = _FEED_FILE_PATTERN % i
        if os.path.exists(path):
            docstruct.append(_parse_feed(xml.etree.ElementTree.parse(path)))

    if not docstruct:
        raise RuntimeError("Could not produce docstruct")

    # Parenthesised single argument: same output under the Python 2 print
    # statement, and still valid syntax under Python 3.
    print(_to_html(docstruct))


if __name__ == "__main__":
    _main()