55 lines
1.4 KiB
Python
55 lines
1.4 KiB
Python
# noqa: D100
|
|
from html.parser import HTMLParser
|
|
import re
|
|
|
|
|
|
def chunks(lst, n):
    """Split lst into consecutive slices of length n, yielding each in turn.

    The final slice is shorter when len(lst) is not a multiple of n.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in offsets)
|
|
|
|
|
|
class HTMLFilter(HTMLParser):
    """Filter human-readable text portions of HTML.

    Character data is accumulated in ``text`` while the document is fed in.
    Data enclosed by the tags listed in ``skip_text_of`` is left out, the
    value of every ``href`` attribute is appended in place, and ``quote`` /
    ``q`` elements are wrapped in typographic quotation marks.
    """

    # Tags whose enclosed character data is not copied into ``text``.
    skip_text_of = ('a', 'style')
    # Text collected so far; rebound on the instance by the first append.
    text = ''
    # True while the parser is inside one of the ``skip_text_of`` tags.
    skip_tag_text = False

    def handle_starttag(self, tag, attrs):
        """Act on start tag.

        Starts skipping text for tags in ``skip_text_of``, appends any
        ``href`` value to ``text`` and opens a quotation for ``quote``/``q``.
        """
        # Skipping is tied to ``skip_text_of`` only: handle_endtag() clears
        # the flag for exactly those tags, so setting it for anything else
        # (e.g. a void ``<link href=...>``) would suppress all later text.
        if tag in self.skip_text_of:
            self.skip_tag_text = True
        for name, value in attrs:
            # A value-less attribute (``<a href>``) is reported as None;
            # appending it would raise TypeError, so it is ignored.
            if name == 'href' and value:
                self.text += value
        if tag in ('quote', 'q'):
            self.text += '“'

    def handle_endtag(self, tag):
        """Act on end tag.

        Resumes text collection after a ``skip_text_of`` tag and closes the
        quotation opened for ``quote``/``q``, followed by a blank line.
        """
        if tag in self.skip_text_of:
            self.skip_tag_text = False
        if tag in ('quote', 'q'):
            self.text += '”\n\n'

    def handle_data(self, data):
        """Append text unless the parser is inside a skipped tag."""
        if self.skip_tag_text:
            return
        self.text += data
|
|
|
|
|
|
def html_to_text(data: str) -> str:
    """Return a human-readable text made from given HTML.

    The markup is run through HTMLFilter, leading spaces and tabs are
    removed from every resulting line, and whitespace (including blank
    lines) is trimmed from both ends of the result.
    """
    f = HTMLFilter()
    f.feed(data)
    # feed() may keep trailing data buffered while waiting for more input
    # (for instance text ending in an unterminated '&'); close() forces it
    # through the handlers so it is not lost.
    f.close()
    lines = [line.lstrip(' \t') for line in f.text.split('\n')]
    # strip() already discards leading whitespace-only lines, so no separate
    # pass over the lines is needed to skip them.
    return '\n'.join(lines).strip()
|