Skip to content

Commit fc8459b

Browse files
committed
Merge pull request #1 from jkehler/bugfix-encoding
BugFix: html pages containing <?xml version="1.0" encoding="utf-8"?> declarations
2 parents 55c88c1 + f1df650 commit fc8459b

1 file changed

Lines changed: 6 additions & 0 deletions

File tree

goose/parsers.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
from copy import deepcopy
2727
from goose.text import innerTrim
2828
from goose.text import encodeValue
29+
import re
2930

3031

3132
class Parser(object):
@@ -51,7 +52,12 @@ def css_select(self, node, selector):
5152
@classmethod
def fromstring(self, html):
    """Parse *html* with ``lxml.html`` and return the resulting tree.

    The input is first passed through ``encodeValue`` (imported from
    ``goose.text``; presumably normalises the text encoding -- confirm
    against that helper).  Any XML declaration carrying an ``encoding``
    pseudo-attribute is then stripped before parsing.

    Because this is a classmethod, ``self`` is the class itself: the parsed
    tree is stored on the class as ``doc`` and is also returned.
    """
    html = encodeValue(html)

    # Remove the <?xml ... encoding=... ?> declaration because it breaks
    # the lxml html parser.  The pattern tolerates arbitrary whitespace
    # (including newlines) between pseudo-attributes, whitespace around
    # '=', any version string, trailing attributes such as standalone=...,
    # and upper-case spellings.  [^>] keeps the match inside a single
    # declaration so following markup is never swallowed.
    html = re.sub(
        r'<\?xml\s[^>]*?encoding\s*=[^>]*?\?>',
        '',
        html,
        flags=re.IGNORECASE,
    )

    self.doc = lxml.html.fromstring(html)
    return self.doc
5662

5763
@classmethod

0 commit comments

Comments
 (0)