Every line of 'parsing html table in python with beautifulsoup' code snippets is scanned for vulnerabilities by our powerful machine learning engine that combs millions of open source libraries, ensuring your Python code is secure.
def parse_table(html):
    """Render *html* through a root ``Node`` and return the generated text.

    A root node is built around the markup, asked to write its output into
    an in-memory text buffer, and the buffer's contents are returned.
    """
    # NOTE(review): the meaning of Node's positional arguments (parent, tag,
    # attributes?) is not visible from here -- confirm against Node.
    tree = Node(None, 'root', '', html)
    buffer = io.StringIO()
    tree.gen(buffer)
    rendered = buffer.getvalue()
    return rendered
def parse_html_table(self, table):
    """Convert a parsed HTML ``<table>`` element into a pandas DataFrame.

    Args:
        table: A BeautifulSoup-style tag exposing ``find_all``. Every
            ``<tr>`` found beneath it is inspected.

    Returns:
        A DataFrame with one row per ``<tr>`` that contains at least one
        ``<td>``. Columns are named after the first row that carries
        ``<th>`` cells, or numbered from 0 when no header row exists.
        Rows shorter than the widest row are padded with NaN, and every
        column whose values all parse as numbers is converted to float.

    Raises:
        ValueError: If header titles were found but their count differs
            from the number of data columns (this includes a table that
            has headers but no data rows at all).
    """
    column_names = []
    data_rows = []  # cell texts, one list per row that has <td> cells

    # Single pass over the rows: collect the cell text and, once, the titles.
    for row in table.find_all("tr"):
        cells = [td.get_text() for td in row.find_all("td")]
        if cells:
            data_rows.append(cells)

        # Only the first row that has <th> cells supplies the column titles.
        if not column_names:
            column_names = [th.get_text() for th in row.find_all("th")]

    # Size the frame to the widest row. Using only the first row's width
    # made any later, wider row fail with IndexError while filling cells.
    n_columns = max((len(cells) for cells in data_rows), default=0)

    # Safeguard on column titles
    if column_names and len(column_names) != n_columns:
        raise ValueError("Column titles do not match the number of columns")

    columns = column_names if column_names else range(n_columns)
    df = pd.DataFrame(columns=columns, index=range(len(data_rows)))
    for row_idx, cells in enumerate(data_rows):
        for col_idx, text in enumerate(cells):
            df.iat[row_idx, col_idx] = text

    # Best-effort numeric conversion: columns that are not entirely numeric
    # are deliberately left as text.
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            pass

    return df
def _parse_single_table(self, table: object,
                        hierarchy: str) -> dict:
    """Record the first row of *table* as a tree entry and return it.

    The second ``<td>`` of the first ``<tr>`` supplies the item title and
    the third supplies the first value. A fourth cell, when present, is
    added as a second value unless it holds the "pokaż szczegóły"
    ("show details") link text.

    Args:
        table: Tag-like object exposing ``find``; its first ``<tr>`` is read.
        hierarchy: Hierarchy label; its first two characters are dropped
            before storing (presumably a leading separator -- confirm
            against the caller).

    Returns:
        The entry dict that was appended to ``self._tree_entries``. The
        signature always declared ``-> dict``, but nothing was returned
        before; callers that ignored the result are unaffected.
    """
    first_row = table.find("tr")
    cells = first_row.find_all("td")
    title = cells[1].contents[0].strip()
    values = [self._strip_cell(cells[2].text)]

    # The optional fourth cell is either a second value or a details link.
    if len(cells) > 3 and "pokaż szczegóły" not in cells[3].text:
        values.append(self._strip_cell(cells[3].text))

    entry = {
        "group": self._group,
        "subgroup": self._subgroup,
        "hierarchy": hierarchy[2:],
        "item": title,
        "values": values,
    }
    self._tree_entries.append(entry)
    return entry
def __init__(self, html_text: str, auto_parse=False):
    """Store the raw HTML, build its soup and optionally parse it at once.

    Args:
        html_text: The HTML markup to wrap.
        auto_parse: When true, ``parse()`` is called immediately and
            ``self.tables`` is filled from the table tags and then the
            enumeration tags.
    """
    self.html_text = html_text
    self.auto_parse = auto_parse
    self.soup = BeautifulSoup(self.html_text, HTML_PARSER)

    # NOTE(review): the source this was recovered from had lost its
    # indentation; the table collection below is assumed to belong to the
    # ``auto_parse`` branch -- confirm against the original file.
    if not auto_parse:
        return

    self.parse()
    self.tables = []
    for table_holder in self.soupy_table_tags:
        self.tables.extend(table_holder.tables)
    for enumeration_holder in self.soupy_enumeration_tags:
        self.tables.extend(enumeration_holder.tables)
def html_parser(html):
    """Parse *html* into a soup, falling back to an ``HTMLParser`` tree.

    ``BeautifulSoup`` is tried first. If it raises, the markup is parsed
    again with an ``HTMLParser`` configured with the "beautifulsoup" tree
    builder.

    Args:
        html: The markup to parse.

    Returns:
        The parsed document produced by whichever parser succeeded.
    """
    try:
        return BeautifulSoup(html)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate instead of
        # silently triggering the fallback parse.
        fallback = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        return fallback.parse(html)
def test_thead_tbody(self):
    """Table whose markup contains both thead and tbody."""
    # The third fixture table is the one under test here.
    parsed = HTMLTableTest.tables[2]

    self.assertEqual(parsed.name, "Joseph Stalin")
    self.assertEqual(parsed.col_names, HTMLTableTest.correct_col_names)
    self.assertEqual(parsed[0], HTMLTableTest.correct_first_row)
def _parse_html_table_of_contents(html):
    """
    Build a tree of AnchorLink instances from a table-of-contents HTML
    string that was generated automatically by the markdown library.

    Returns the list of top-level AnchorLink instances; nested entries are
    reachable through each link's ``children``.
    """
    roots = []
    ancestors = []
    depth = 0

    # The first two and last two lines are skipped (presumably the wrapper
    # markup around the actual entries).
    for line in html.splitlines()[2:-2]:
        toc_parser = _TOCParser()
        toc_parser.feed(line)

        if not toc_parser.title:
            # A line without a title may close a nested list: step back out.
            if line.startswith('</ul>'):
                depth -= 1
                if ancestors:
                    ancestors.pop()
            continue

        try:
            href = toc_parser.attrs['href']
        except KeyError:
            # Titled lines without a link target are ignored.
            continue

        link = AnchorLink(toc_parser.title, href, depth)

        # Attach to the current parent, or to the result when at top level.
        siblings = ancestors[-1].children if ancestors else roots
        siblings.append(link)

        # A line ending with an opening list means this entry has children.
        if line.endswith('<ul>'):
            depth += 1
            ancestors.append(link)

    # For the table of contents, the first element is always marked active.
    if roots:
        roots[0].active = True

    return roots
def init_bs(html):
    """Parse *html* with BeautifulSoup using the html5lib parser."""
    soup = bs4.BeautifulSoup(html, features="html5lib")
    return soup