Commit 0a954c9f authored by Josh Roesslein
Browse files

Fix parsing of search result 'source' attribute. Properly unescape html.

parent a5c206db
...@@ -13,6 +13,7 @@ during upgrade will be listed here. ...@@ -13,6 +13,7 @@ during upgrade will be listed here.
+ API.friends_ids and API.followers_ids now return a list of integers. + API.friends_ids and API.followers_ids now return a list of integers.
Parser updated to handle cursor responses. See above. Parser updated to handle cursor responses. See above.
+ Fix Status.source_url parsing + Fix Status.source_url parsing
+ Fix search result 'source' parsing to properly unescape html and extract source
+ Cursor + Cursor
Added the Cursor object to help with pagination within the API. Added the Cursor object to help with pagination within the API.
Please see the pagination tutorial for more details. Please see the pagination tutorial for more details.
......
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
# Copyright 2009 Joshua Roesslein # Copyright 2009 Joshua Roesslein
# See LICENSE # See LICENSE
import htmlentitydefs
import re
from datetime import datetime from datetime import datetime
from . models import models from . models import models
...@@ -40,6 +42,28 @@ def _parse_search_datetime(str): ...@@ -40,6 +42,28 @@ def _parse_search_datetime(str):
return datetime.strptime(str, '%a, %d %b %Y %H:%M:%S +0000') return datetime.strptime(str, '%a, %d %b %Y %H:%M:%S +0000')
def unescape_html(text):
    """Replace HTML character references and named entities in *text*.

    Handles decimal references (``&#65;``), hexadecimal references
    (``&#x41;``) and named entities (``&amp;``).  Anything that cannot be
    resolved -- an unknown entity name, malformed digits, or a code point
    outside the supported range -- is left in the output untouched.

    Returns the unescaped string.
    """
    # Resolve the entity table and the code-point-to-character function
    # locally so the helper works on both Python 2 and Python 3.
    try:
        from htmlentitydefs import name2codepoint  # Python 2
        to_char = unichr
    except ImportError:
        from html.entities import name2codepoint  # Python 3
        to_char = chr

    def fixup(m):
        entity = m.group(0)
        try:
            if entity[:3] == "&#x":
                # hexadecimal character reference
                return to_char(int(entity[3:-1], 16))
            if entity[:2] == "&#":
                # decimal character reference
                return to_char(int(entity[2:-1]))
            # named entity
            return to_char(name2codepoint[entity[1:-1]])
        except (KeyError, ValueError, OverflowError):
            # OverflowError: the number is too large to be converted to a
            # character at all (e.g. "&#99999999999999999999;").
            return entity  # leave as is

    return re.sub(r"&#?\w+;", fixup, text)
def _parse_html_value(html):
    """Return the text enclosed by the outermost tag pair in *html*.

    The result is everything between the first ``>`` and the last ``<``,
    e.g. ``'<a href="...">web</a>'`` yields ``'web'``.  If *html* lacks
    either character it is not tag-wrapped and is returned unchanged;
    slicing with the -1 "not found" indexes would otherwise silently drop
    the final character of a plain value such as ``'web'``.
    """
    start = html.find('>')
    end = html.rfind('<')
    if start == -1 or end == -1:
        return html
    return html[start + 1:end]
...@@ -207,6 +231,8 @@ def _parse_search_result(obj, api): ...@@ -207,6 +231,8 @@ def _parse_search_result(obj, api):
for k, v in obj.items(): for k, v in obj.items():
if k == 'created_at': if k == 'created_at':
setattr(result, k, _parse_search_datetime(v)) setattr(result, k, _parse_search_datetime(v))
elif k == 'source':
setattr(result, k, _parse_html_value(unescape_html(v)))
else: else:
setattr(result, k, v) setattr(result, k, v)
return result return result
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment