/
/
opt
/
gsutil
/
third_party
/
pyparsing
/
examples
Server: in-mum-web1112.main-hosting.eu (62.72.28.111)
You: 216.73.216.130
PHP 8.3.16
Dir:
/opt/gsutil/third_party/pyparsing/examples
Edit:
/opt/gsutil/third_party/pyparsing/examples/html_stripper.py
# # html_stripper.py # # Sample code for stripping HTML markup tags and scripts from # HTML source files. # # Copyright (c) 2006, 2016, 2023, Paul McGuire # from urllib.request import urlopen from pyparsing import ( LineEnd, quoted_string, make_html_tags, common_html_entity, replace_html_entity, html_comment, any_open_tag, any_close_tag, replace_with, ) # if <script> tags found, remove script content also script_open, script_close = make_html_tags("script") script_body = script_open + ... + script_close # translate HTML entities common_html_entity.set_parse_action(replace_html_entity) stripper = ( # parse quoted strings first, if they enclose HTML tags - keep these quoted_string # parse and translate HTML entities (&, <, >, etc.) | common_html_entity # expressions to be stripped - suppress() will remove them when transforming | ( html_comment | script_body | any_open_tag | any_close_tag ).suppress() ) repeated_newlines = LineEnd()[2, ...] repeated_newlines.set_parse_action(replace_with("\n\n")) if __name__ == '__main__': # get some HTML target_url = "https://wiki.python.org/moin/PythonDecoratorLibrary" with urlopen(target_url) as targetPage: target_html = targetPage.read().decode("UTF-8") # first pass, strip out tags and translate entities # (use transform_string() instead of parse_string - will do # suppressions and parse actions) first_pass = stripper.transform_string(target_html) # first pass leaves many blank lines, collapse these down second_pass = repeated_newlines.transform_string(first_pass) print(second_pass)
Ukuran: 1.7 KB