Every line of the following 'how to drop duplicate rows in pandas' code snippets is scanned for vulnerabilities by our powerful machine-learning engine, which combs millions of open source libraries to help ensure your Python code is secure.
def drop_duplicate_events(df):
    """
    Collapse a group of duplicate event rows into a single record.

    Every field is taken from the row with the greatest ``event_index``
    (the most recent information), except ``event_index`` itself, which
    is preserved from the earliest row.

    Parameters
    ----------
    df : pandas.DataFrame
        Group of rows sharing one logical event; must contain an
        ``event_index`` column.

    Returns
    -------
    dict
        The latest row as a dict, with ``event_index`` replaced by the
        first row's value.
    """
    ordered = df.sort_values('event_index', na_position='last')
    merged = ordered.iloc[-1].to_dict()
    # Keep the original (earliest) event index on the merged record.
    merged['event_index'] = ordered.event_index.iloc[0]
    return merged
def remove_duplicates(df_or_series):
    """ Remove duplicate rows or values by keeping the first of each duplicate.

    Parameters
    ----------
    df_or_series : :any:`pandas.DataFrame` or :any:`pandas.Series`
        Pandas object from which to drop duplicate index values.

    Returns
    -------
    deduplicated : :any:`pandas.DataFrame` or :any:`pandas.Series`
        The deduplicated pandas object.
    """
    # CalTrack 2.3.2.2
    dup_mask = df_or_series.index.duplicated(keep="first")
    return df_or_series[~dup_mask]
39 def _drop_col(self, df): 40 ''' 41 Drops last column, which was added in the parsing procedure due to a 42 trailing white space for each sample in the text file 43 Arguments: 44 df: pandas dataframe 45 Return: 46 df: original df with last column dropped 47 ''' 48 return df.drop(df.columns[-1], axis=1)
80 def _clean_columns(df, keep_colnames): 81 new_colnames = [] 82 for i,colname in enumerate(df.columns): 83 if colname not in keep_colnames: 84 new_colnames.append(i) 85 else: 86 new_colnames.append(colname) 87 return new_colnames
def _unique(df, columns=None):
    """Summarize the distinct non-null values of selected columns.

    Arguments:
        df: pandas dataframe to inspect
        columns: a column name, a list of column names, or falsy for
            all columns
    Return:
        dict mapping column name -> {'count': int, 'values': list of
        sorted unique values}
    """
    # Normalize the column selection to a list of names.
    if isinstance(columns, str):
        columns = [columns]
    target_cols = columns if columns else df.columns.tolist()
    info = {}
    for name in target_cols:
        # NOTE(review): _flatten_list is defined elsewhere in this
        # module; presumably it expands nested/list-valued cells.
        flattened = _flatten_list(df[name].dropna().values)
        distinct = np.unique(list(flattened)).tolist()
        info[name] = {'count': len(distinct), 'values': distinct}
    return info
114 @staticmethod 115 def combine_duplicate_rows(X_matrix, y_matrix, rowlabels): 116 X_unique, idxs, invs, cts = np.unique(X_matrix, 117 return_index=True, 118 return_inverse=True, 119 return_counts=True, 120 axis=0) 121 num_unique = X_unique.shape[0] 122 if num_unique == X_matrix.shape[0]: 123 # No duplicate rows 124 return X_matrix, y_matrix, rowlabels 125 126 # Combine duplicate rows 127 y_unique = np.empty((num_unique, y_matrix.shape[1])) 128 rowlabels_unique = np.empty(num_unique, dtype=tuple) 129 ix = np.arange(X_matrix.shape[0]) 130 for i, count in enumerate(cts): 131 if count == 1: 132 y_unique[i, :] = y_matrix[idxs[i], :] 133 rowlabels_unique[i] = (rowlabels[idxs[i]],) 134 else: 135 dup_idxs = ix[invs == i] 136 y_unique[i, :] = np.median(y_matrix[dup_idxs, :], axis=0) 137 rowlabels_unique[i] = tuple(rowlabels[dup_idxs]) 138 return X_unique, y_unique, rowlabels_unique
def drop_some(df_: pd.DataFrame, thresh: int) -> pd.DataFrame:
    """Drop columns that contain fewer than ``thresh`` non-NA values.

    Parameters
    ----------
    df_ : pd.DataFrame
        Frame to prune.
    thresh : int
        Minimum number of non-NA values a column must hold to be kept
        (standard ``DataFrame.dropna`` ``thresh`` semantics — the
        original comment claiming "minimum number of NA" was wrong).

    Returns
    -------
    pd.DataFrame
        Copy of ``df_`` with sparse columns removed.
    """
    # axis must be a keyword argument: passing it positionally was
    # deprecated in pandas 1.x and removed in pandas 2.0.
    return df_.dropna(axis=1, thresh=thresh)
def dropcols(df, start=None, end=None):
    """Drop columns that contain NaN within [start, end] inclusive.

    A wrapper around ``DataFrame.dropna(axis=1, subset=...)`` that
    builds the *subset* of row labels from a date window, giving an
    easier syntax for tseries-indexed DataFrames. NaNs falling outside
    the window do not cause a column to be dropped.

    Parameters
    ----------
    df : DataFrame
    start : str or datetime, default None
        start cutoff date, inclusive (defaults to the first index label)
    end : str or datetime, default None
        end cutoff date, inclusive (defaults to the last index label)

    Raises
    ------
    ValueError
        If *df* is a Series rather than a DataFrame.

    Example
    -------
    df = DataFrame(np.random.randn(10, 3),
                   index=pd.date_range('2017', periods=10))
    df.loc['2017-01-04', 0] = np.nan
    df.loc['2017-01-02', 2] = np.nan
    df.loc['2017-01-05':, 1] = np.nan
    # Only col 2 survives dropcols(df, start='2017-01-03'): its NaN
    # falls before `start`.
    """
    if isinstance(df, Series):
        raise ValueError("func only applies to `pd.DataFrame`")
    lo = df.index[0] if start is None else start
    hi = df.index[-1] if end is None else end
    window = df.index[(df.index >= lo) & (df.index <= hi)]
    return df.dropna(axis=1, subset=window)
def test_dupes_with_nulls():
    """Rows duplicated on the join keys — including NaN key values —
    should still match, so df1 must be reported as a subset of df2."""
    left = pd.DataFrame(
        {
            "fld_1": [1, 2, 2, 3, 3, 4, 5, 5],
            "fld_2": ["A"] + [np.nan] * 7,
        }
    )
    right = pd.DataFrame(
        {
            "fld_1": [1, 2, 3, 4, 5],
            "fld_2": ["A"] + [np.nan] * 4,
        }
    )
    comparison = datacompy.Compare(left, right, join_columns=["fld_1", "fld_2"])
    assert comparison.subset()