When parsing the page response of
https://m.facebook.com/page_content_list_view/more/?page_id=139375155037&start_cursor=%7B%22timeline_cursor%22%3A%22AQHR5-9azGSrK8QLl5A4EuPrQ76_dL9ls5NudeSqcSEBqUb61-oqQmEjzkv9mVrrEouA7fmKeiBA6W0y06x6MKv42cUEGP3NjD-2eTeCiABuFW3qDy61dS18DT-r-cHQO1Ov%22%2C%22timeline_section_cursor%22%3Anull%2C%22has_next_page%22%3Atrue%7D&num_to_fetch=4&surface_type=posts_tab
the script dies with:

Parsing page response
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/requests_html.py", line 160, in lxml
    self._lxml = soup_parse(self.html, features='html.parser')
  File "/usr/local/lib/python3.7/site-packages/lxml/html/soupparser.py", line 33, in fromstring
    return _parse(data, beautifulsoup, makeelement, **bsargs)
  File "/usr/local/lib/python3.7/site-packages/lxml/html/soupparser.py", line 79, in _parse
    root = _convert_tree(tree, makeelement)
  File "/usr/local/lib/python3.7/site-packages/lxml/html/soupparser.py", line 152, in _convert_tree
    res_root = convert_node(html_root)
  File "/usr/local/lib/python3.7/site-packages/lxml/html/soupparser.py", line 216, in convert_node
    return handler(bs_node, parent)
  File "/usr/local/lib/python3.7/site-packages/lxml/html/soupparser.py", line 255, in convert_tag
    handler(child, res)
  File "/usr/local/lib/python3.7/site-packages/lxml/html/soupparser.py", line 255, in convert_tag
    handler(child, res)
  File "/usr/local/lib/python3.7/site-packages/lxml/html/soupparser.py", line 255, in convert_tag
    handler(child, res)
  [Previous line repeated 7 more times]
  File "/usr/local/lib/python3.7/site-packages/lxml/html/soupparser.py", line 242, in convert_tag
    res = etree.SubElement(parent, bs_node.name, attrib=attribs)
  File "src/lxml/etree.pyx", line 3136, in lxml.etree.SubElement
  File "src/lxml/apihelpers.pxi", line 199, in lxml.etree._makeSubElement
  File "src/lxml/apihelpers.pxi", line 194, in lxml.etree._makeSubElement
  File "src/lxml/apihelpers.pxi", line 323, in lxml.etree._initNodeAttributes
  File "src/lxml/apihelpers.pxi", line 334, in lxml.etree._addAttributeToNode
  File "src/lxml/apihelpers.pxi", line 1540, in lxml.etree._utf8
ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "facebook-scraper", line 8, in <module>
    sys.exit(run())
  File "/Users/morbus/Desktop/facebook-scraper/facebook_scraper/__main__.py", line 33, in run
    write_posts_to_csv(**kwargs, filename=args.filename, pages=args.pages, encoding=args.encoding)
  File "/Users/morbus/Desktop/facebook-scraper/facebook_scraper/__init__.py", line 86, in write_posts_to_csv
    list_of_posts = list(get_posts(account=account, group=group, **kwargs))
  File "/Users/morbus/Desktop/facebook-scraper/facebook_scraper/facebook_scraper.py", line 95, in _generic_get_posts
    for i, page in zip(counter, iter_pages_fn()):
  File "/Users/morbus/Desktop/facebook-scraper/facebook_scraper/page_iterators.py", line 35, in generic_iter_pages
    page = parser.get_page()
  File "/Users/morbus/Desktop/facebook-scraper/facebook_scraper/page_iterators.py", line 67, in get_page
    raw_posts = raw_page.find('article')
  File "/usr/local/lib/python3.7/site-packages/requests_html.py", line 212, in find
    for found in self.pq(selector)
  File "/usr/local/lib/python3.7/site-packages/requests_html.py", line 149, in pq
    self._pq = PyQuery(self.lxml)
  File "/usr/local/lib/python3.7/site-packages/requests_html.py", line 162, in lxml
    self._lxml = lxml.html.fromstring(self.raw_html)
  File "/usr/local/lib/python3.7/site-packages/requests_html.py", line 97, in raw_html
    return etree.tostring(self.element, encoding='unicode').strip().encode(self.encoding)
TypeError: encode() argument 1 must be str, not None
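For context, the first ValueError can be reproduced with lxml alone. A minimal sketch (the attribute name and value below are made up, not taken from the Facebook markup), showing that lxml rejects control characters such as backspace (U+0008) in element text or attribute values:

from lxml import etree

el = etree.Element("div")
try:
    # A backspace in an attribute value triggers the same XML-compatibility
    # check that soupparser hits while converting the BeautifulSoup tree.
    el.set("data-href", "foo\x08bar")
except ValueError as exc:
    print(exc)  # All strings must be XML compatible: Unicode or ASCII, ...

The second traceback appears to be the fallback path inside requests_html failing afterwards: self.encoding is still None when raw_html tries to re-encode the document, hence the TypeError.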
I am NOT a Python coder, though I have decades of experience in other languages.
My initial guess is that this is related to make_html_element in utils.py. There, we use PyQuery(html); PyQuery, in turn, is a wrapper around lxml, which is what raises the error above. In my particular case, I'm fetching a page archive to read through rather than to archive perfectly, so I'm thinking the blunt control-character stripping of remove_control_characters from html5lib/html5lib-python#96 will be Good Enough for me. I'm trying this now:
def make_html_element(html: str, url=DEFAULT_URL) -> Element:
    html = remove_control_characters(html)
    pq_element = PyQuery(html)[0]  # PyQuery is a list, so we take the first element
    ...
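For reference, a minimal sketch of what the remove_control_characters helper called above might look like (assumed for illustration; the version discussed in html5lib/html5lib-python#96 also rewrites numeric character references such as &#8;, which this sketch skips):

import re

# Strip the C0/DEL control characters that lxml rejects, keeping the
# XML-legal whitespace characters tab, newline, and carriage return.
CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")

def remove_control_characters(html: str) -> str:
    return CONTROL_CHAR_RE.sub("", html)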
Thanks for the report. For some reason that FB response does indeed contain a backspace control character. I'll consider adding the remove_control_characters call.
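If it helps to confirm, a rough way to check a fetched response for offending characters (the request line is illustrative only; the real URL and headers would be whatever the scraper normally sends):

import unicodedata
import requests

resp = requests.get("https://m.facebook.com/page_content_list_view/more/?...")  # placeholder URL
# Collect control characters (Unicode category "Cc") other than tab/newline/CR.
bad = {ch for ch in resp.text if unicodedata.category(ch) == "Cc" and ch not in "\t\n\r"}
print(sorted(hex(ord(ch)) for ch in bad))  # e.g. ['0x8'] for a backspace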