Remove text from HTML files but keep JavaScript and structure using Python
1 answer
I would go with BeautifulSoup:
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from copy import copy
def strip_content(in_tag):
    """Return a copy of *in_tag* with its text removed but its tag structure kept.

    ``<script>`` elements are returned untouched, so their contents survive.
    For every other element, all ``NavigableString`` children are dropped and
    the remaining child tags are stripped recursively.

    Args:
        in_tag: The element (or whole soup) to strip. It is not modified.

    Returns:
        A stripped copy of *in_tag*.
    """
    # Copy once here: the copy already carries the whole subtree, so
    # re-copying at every recursion level would duplicate each node once per
    # ancestor. All further work happens in place on this private copy.
    tag = copy(in_tag)
    _strip_in_place(tag)
    return tag


def _strip_in_place(tag):
    """Strip string children from *tag* and its descendants, mutating *tag*."""
    if tag.name == 'script':
        # Do not mess with scripts
        return
    # Collect the child tags before clearing; strings are left behind.
    children = [child for child in tag.children if not isinstance(child, NavigableString)]
    # Remove everything from the tag
    tag.clear()
    for child in children:
        # Strip each child tag, then add it back
        _strip_in_place(child)
        tag.append(child)
def test(filename):
    """Parse the HTML file *filename*, strip its text, and print the result.

    Args:
        filename: Path of the HTML file to read.
    """
    # Use a context manager so the file handle is closed instead of leaked.
    with open(filename) as html_file:
        # No parser is named, so BeautifulSoup chooses one itself, exactly as
        # the original call did.
        soup = BeautifulSoup(html_file)
    cleaned_soup = strip_content(soup)
    print(cleaned_soup.prettify())
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    test("myfile.html")
+3
source to share