Remove text from HTML files but keep JavaScript and structure using Python

There are many ways to extract text from an HTML file, but I would like to do the opposite: remove the text while preserving the structure and the JavaScript code.

For example, remove all

while maintaining

Is there an easy way to do this? Any help is appreciated. Greetings

+3


source to share


1 answer


I would go with BeautifulSoup:



from bs4 import BeautifulSoup
from bs4.element import NavigableString
from copy import copy

def strip_content(in_tag):
    """Return a copy of *in_tag* with its text removed and its tag structure kept.

    Every ``NavigableString`` child (at any depth) is dropped, while nested
    tags are preserved. ``<script>`` tags are returned with their content
    untouched so that JavaScript survives.

    The input is copied exactly once, up front, and the stripping is then done
    in place on that copy; this avoids re-copying each subtree at every level
    of recursion. (Relies on bs4's tag copy duplicating the whole subtree, so
    the caller's tree is not modified.)

    :param in_tag: a bs4 ``Tag`` (or ``BeautifulSoup`` document) to strip.
    :return: the stripped copy.
    """
    tag = copy(in_tag)  # remove this line if you don't care about your input
    _strip_in_place(tag)
    return tag


def _strip_in_place(tag):
    """Remove all text nodes below *tag* in place, leaving scripts alone."""
    if tag.name == 'script':
        # Do not mess with scripts
        return
    # Snapshot the children first: extract() mutates the tag's child list,
    # which must not happen while iterating over it directly.
    for child in list(tag.children):
        if isinstance(child, NavigableString):
            # Text node: detach it from the tree.
            child.extract()
        else:
            # Nested tag: keep it, but strip its own content.
            _strip_in_place(child)

def test(filename):
    """Parse the HTML file *filename*, strip its text and print the result.

    :param filename: path of the HTML file to read.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(filename) as html_file:
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed (and warns), so output can differ between
        # machines. "html.parser" ships with the standard library.
        soup = BeautifulSoup(html_file, "html.parser")
    cleaned_soup = strip_content(soup)
    print(cleaned_soup.prettify())

# Run the demo on the hard-coded path "myfile.html" only when this file is
# executed as a script, not when it is imported.
if __name__ == "__main__":
    test("myfile.html")

      

+3


source







All Articles