You can use the Path.glob
method from the pathlib
standard-library module.
For example:
from pathlib import Path
def get_soup(html_file_path):  # added argument
    """Parse an HTML file into a BeautifulSoup tree.

    Args:
        html_file_path: pathlib.Path pointing at the HTML file.

    Returns:
        A BeautifulSoup document parsed with the "lxml" parser.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the original bare open() leaked it until garbage collection.
    with html_file_path.open() as f:
        return BeautifulSoup(f, "lxml")
def get_all_tables(soup):
    """Return every <table> element found in the parsed document."""
    tables = soup.find_all("table")
    return tables
def get_all_html_files(root_path):
    """Yield every *.html file under root_path, searching recursively."""
    root = Path(root_path)
    # rglob("*.html") is equivalent to glob("**/*.html").
    return root.rglob("*.html")
if __name__ == "__main__":
    # Folder that holds the HTML files to process.
    html_root = Path("./html_files/")
    for page_path in get_all_html_files(html_root):
        page_soup = get_soup(page_path)
        page_tables = get_all_tables(page_soup)
I am still new to coding. I needed to write code that iterates through a data folder containing many HTML files and executes a predefined function on each one (extracting specific tables from the HTML document). I used bs4 to parse the HTML files. The proposed solution below allowed me to retrieve the files and extract the tables from each HTML file.
import glob
from pathlib import Path

from bs4 import BeautifulSoup
def get_soup(html_file_path):
    """Parse one HTML file into a BeautifulSoup tree.

    Args:
        html_file_path: pathlib.Path of the HTML file to parse.

    Returns:
        A BeautifulSoup document parsed with the "lxml" parser.
    """
    # Context manager guarantees the file handle is closed promptly;
    # the original bare open() never closed it.
    with html_file_path.open() as f:
        return BeautifulSoup(f, "lxml")
def get_all_tables(soup):
    """Collect all <table> tags from a BeautifulSoup document."""
    found = soup.find_all("table")
    return found
def get_all_html_files(root_path):
    """Return an iterator over every .html file below root_path (recursive)."""
    base = Path(root_path)
    # rglob("*.html") matches the same files as glob("**/*.html").
    return base.rglob("*.html")
if __name__ == "__main__":
    # Folder containing the HTML data files.
    html_root = Path("data_file_pathname/")
    # Bug fix: the original referenced an undefined name `html_file` —
    # the loop over get_all_html_files() was missing, so the script
    # raised NameError. Iterate every file and accumulate the tables
    # so the final count covers the whole folder, not just one file.
    tables = []
    for html_file in get_all_html_files(html_root):
        soup = get_soup(html_file)
        tables.extend(get_all_tables(soup))
    print(f"[+] Found a total of {len(tables)} tables.")
Thanks