Commit 0a954c9f authored by Josh Roesslein
Browse files

Fix parsing of search result 'source' attribute. Properly unescape html.

parent a5c206db
......@@ -13,6 +13,7 @@ during upgrade will be listed here.
+ API.friends_ids and API.followers_ids now return a list of integers.
Parser updated to handle cursor responses. See above.
+ Fix Status.source_url parsing
+ Fix search result 'source' parsing to properly unescape html and extract source
+ Cursor
Added the Cursor object to help with pagination within the API.
Please see the pagination tutorial for more details.
......
......@@ -2,6 +2,8 @@
# Copyright 2009 Joshua Roesslein
# See LICENSE
import htmlentitydefs
import re
from datetime import datetime
from . models import models
......@@ -40,6 +42,28 @@ def _parse_search_datetime(str):
return datetime.strptime(str, '%a, %d %b %Y %H:%M:%S +0000')
try:  # Python 2 names
    from htmlentitydefs import name2codepoint as _name2codepoint
    _unichr = unichr
except ImportError:  # Python 3 equivalents
    from html.entities import name2codepoint as _name2codepoint
    _unichr = chr

# Matches numeric (&#65; / &#x41;) and named (&amp;) character references.
# Compiled once so repeated calls do not go through the pattern cache.
_ENTITY_RE = re.compile(r"&#?\w+;")


def unescape_html(text):
    """Return *text* with HTML character references replaced by characters.

    Decimal (``&#65;``), hexadecimal (``&#x41;`` or ``&#X41;``) and named
    (``&amp;``) references are decoded.  Anything that cannot be decoded --
    an unknown entity name, malformed digits, or a code point outside the
    range the interpreter supports -- is left in the output exactly as it
    appeared in the input.
    """
    def fixup(match):
        entity = match.group(0)
        try:
            # The hex marker is case-insensitive in HTML, so accept
            # both "&#x" and "&#X".
            if entity[:3].lower() == "&#x":
                return _unichr(int(entity[3:-1], 16))
            if entity[:2] == "&#":
                return _unichr(int(entity[2:-1]))
            return _unichr(_name2codepoint[entity[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name.
            # ValueError: bad digits or code point out of range.
            # OverflowError: number too large for the character builtin.
            return entity  # leave as is

    return _ENTITY_RE.sub(fixup, text)
def _parse_html_value(html):
return html[html.find('>')+1:html.rfind('<')]
......@@ -207,6 +231,8 @@ def _parse_search_result(obj, api):
for k, v in obj.items():
if k == 'created_at':
setattr(result, k, _parse_search_datetime(v))
elif k == 'source':
setattr(result, k, _parse_html_value(unescape_html(v)))
else:
setattr(result, k, v)
return result
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment