8 examples of 'parsing html table in python with beautifulsoup' in Python

Every line of 'parsing html table in python with beautifulsoup' code snippets is scanned for vulnerabilities by our powerful machine learning engine that combs millions of open source libraries, ensuring your Python code is secure.

All examples are scanned by Snyk Code

By copying the Snyk Code Snippets you agree to
def parse_table(html):
    """Render *html* through a root ``Node`` and return the generated text.

    A root node is built around the markup, asked to write its output
    into an in-memory text buffer, and the buffer's contents are returned.
    """
    tree = Node(None, 'root', '', html)
    buffer = io.StringIO()
    tree.gen(buffer)
    output = buffer.getvalue()
    return output
def parse_html_table(self, table):
    """Convert a parsed HTML table element into a pandas DataFrame.

    Args:
        table: Object exposing ``find_all("tr")``. Every returned row must
            expose ``find_all("td")`` and ``find_all("th")``, whose items
            provide ``get_text()`` (e.g. a BeautifulSoup tag).

    Returns:
        A DataFrame with one row per ``<tr>`` that holds at least one
        ``<td>``. Columns are titled after the first row containing
        ``<th>`` cells, or numbered from 0 when no such row exists. Rows
        shorter than the widest row are padded with NaN. Columns whose
        values all convert to float are converted; the rest stay as text.

    Raises:
        ValueError: If column titles were found and their count differs
            from the number of columns in the widest data row.
    """
    column_names = []
    data_rows = []

    # Single pass: collect the text of every data row and the first header.
    for row in table.find_all("tr"):
        td_tags = row.find_all("td")
        if td_tags:
            data_rows.append([td.get_text() for td in td_tags])

        th_tags = row.find_all("th")
        if th_tags and not column_names:
            column_names = [th.get_text() for th in th_tags]

    # Size the frame to the widest row so a ragged table cannot index past
    # the last column (the first data row is not necessarily the widest).
    n_columns = max((len(cells) for cells in data_rows), default=0)

    # Safeguard on column titles
    if column_names and len(column_names) != n_columns:
        raise ValueError("Column titles do not match the number of columns")

    columns = column_names if column_names else range(n_columns)
    missing = float("nan")
    padded_rows = [
        cells + [missing] * (n_columns - len(cells)) for cells in data_rows
    ]
    # dtype=object keeps the raw cell text until the float conversion below.
    df = pd.DataFrame(padded_rows, columns=columns, dtype=object)

    # Convert to float if possible; non-numeric columns are left untouched.
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            pass

    return df
def _parse_single_table(self, table: object,
                        hierarchy: str) -> dict:
    """Record the entry described by the first row of *table*.

    The first ``<tr>`` of the table is read: the second cell supplies the
    item title, the third cell the first value, and an optional fourth
    cell a second value. The resulting entry is appended to
    ``self._tree_entries``.

    Args:
        table: Parsed table element exposing ``find("tr")``.
        hierarchy: Hierarchy string; its first two characters are dropped
            before it is stored (presumably a leading separator - confirm
            against the caller).

    Returns:
        The entry dict that was appended to ``self._tree_entries``.
    """
    first_row = table.find("tr")
    cells = first_row.find_all("td")
    title = cells[1].contents[0].strip()
    values = [self._strip_cell(cells[2].text)]

    # The fourth cell only counts as a value when it does not contain the
    # "pokaż szczegóły" (Polish: "show details") text.
    if len(cells) > 3 and "pokaż szczegóły" not in cells[3].text:
        values.append(self._strip_cell(cells[3].text))

    entry = {
        "group": self._group,
        "subgroup": self._subgroup,
        "hierarchy": hierarchy[2:],
        "item": title,
        "values": values,
    }
    self._tree_entries.append(entry)
    # Return the entry so the declared ``-> dict`` contract actually holds.
    return entry
def __init__(self, html_text: str, auto_parse=False):
    """Wrap *html_text* in a BeautifulSoup tree and optionally parse it.

    Args:
        html_text: Raw HTML markup to work on.
        auto_parse: When true, ``self.parse()`` is run immediately and the
            tables of every table tag and enumeration tag are gathered
            into ``self.tables``.
    """
    self.html_text = html_text
    self.auto_parse = auto_parse

    self.soup = BeautifulSoup(self.html_text, HTML_PARSER)

    # NOTE(review): indentation was lost in the source listing; the table
    # collection is assumed to belong to the ``auto_parse`` branch - confirm.
    # ``self.tables`` is always defined so that reading it before an
    # explicit parse yields an empty list instead of AttributeError.
    tables = []
    if auto_parse:
        self.parse()
        for table_tag in self.soupy_table_tags:
            tables.extend(table_tag.tables)
        for enum_tag in self.soupy_enumeration_tags:
            tables.extend(enum_tag.tables)
    self.tables = tables
def html_parser(html):
    """Parse *html* into a soup object, with a fallback parser.

    ``BeautifulSoup(html)`` is tried first. If it raises, the markup is
    parsed with ``HTMLParser`` configured with the "beautifulsoup" tree
    builder instead (presumably html5lib's parser - confirm against the
    file's imports).

    Args:
        html: Markup to parse.

    Returns:
        The parsed document produced by whichever parser succeeded.
    """
    try:
        return BeautifulSoup(html)
    except Exception:
        # Deliberate best-effort fallback. ``Exception`` (rather than a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        return parser.parse(html)
def test_thead_tbody(self):
    """Table that contains both thead and tbody."""
    fixture = HTMLTableTest
    parsed = fixture.tables[2]

    # Name, header row and first data row must all match the fixture.
    self.assertEqual(parsed.name, "Joseph Stalin")
    self.assertEqual(parsed.col_names, fixture.correct_col_names)
    self.assertEqual(parsed[0], fixture.correct_first_row)
def _parse_html_table_of_contents(html):
    """
    Turn a table of contents string, as generated automatically by the
    markdown library, into a tree of AnchorLink instances.

    Returns the list of top-level (parent) AnchorLink instances.
    """
    roots = []
    stack = []
    depth = 0

    # The first two and the last two lines of the markup are skipped.
    for line in html.splitlines()[2:-2]:
        toc_parser = _TOCParser()
        toc_parser.feed(line)

        if not toc_parser.title:
            # No title on this line: it may be closing a nested list.
            if line.startswith('</ul>'):
                depth -= 1
                if stack:
                    stack.pop()
            continue

        try:
            href = toc_parser.attrs['href']
        except KeyError:
            # Titled lines without a link target are ignored.
            continue

        link = AnchorLink(toc_parser.title, href, depth)

        # Attach to the current parent, or to the result when topmost.
        siblings = stack[-1].children if stack else roots
        siblings.append(link)

        # An opening list at the end of the line means this item has
        # children, so it becomes the current parent.
        if line.endswith('<ul>'):
            depth += 1
            stack.append(link)

    # For the table of contents, the first element is always active.
    if roots:
        roots[0].active = True

    return roots
def init_bs(html):
    """Build a BeautifulSoup tree from *html* using the "html5lib" feature."""
    soup = bs4.BeautifulSoup(html, features="html5lib")
    return soup

Related snippets