9 examples of 'how to drop duplicate rows in pandas' in Python

Every line of 'how to drop duplicate rows in pandas' code snippets is scanned for vulnerabilities by our powerful machine learning engine that combs millions of open source libraries, ensuring your Python code is secure.

All examples are scanned by Snyk Code

By copying the Snyk Code Snippets, you agree to this disclaimer.
def drop_duplicate_events(df):
    """Collapse a group of duplicate event rows into a single dict.

    The most recent row (highest ``event_index``, with NaNs sorted last)
    supplies every field value, while ``event_index`` itself is taken from
    the earliest row so the original ordering key survives the merge.
    """
    ordered = df.sort_values('event_index', na_position='last')
    merged = ordered.iloc[-1].to_dict()
    # Overwrite with the first row's index to keep the original key.
    merged['event_index'] = ordered.event_index.iloc[0]
    return merged
Important

Use secure code every time

Secure your code as it's written. Use Snyk Code to scan source code in minutes – no build needed – and fix issues immediately. Enable Snyk Code

def remove_duplicates(df_or_series):
    """ Remove rows or values whose index label repeats, keeping the first
    occurrence of each duplicated label.

    Parameters
    ----------
    df_or_series : :any:`pandas.DataFrame` or :any:`pandas.Series`
        Pandas object from which to drop duplicate index values.

    Returns
    -------
    deduplicated : :any:`pandas.DataFrame` or :any:`pandas.Series`
        The deduplicated pandas object.
    """
    # CalTrack 2.3.2.2
    is_first_occurrence = ~df_or_series.index.duplicated(keep="first")
    return df_or_series[is_first_occurrence]
39def _drop_col(self, df):
40 '''
41 Drops last column, which was added in the parsing procedure due to a
42 trailing white space for each sample in the text file
43 Arguments:
44 df: pandas dataframe
45 Return:
46 df: original df with last column dropped
47 '''
48 return df.drop(df.columns[-1], axis=1)
80def _clean_columns(df, keep_colnames):
81 new_colnames = []
82 for i,colname in enumerate(df.columns):
83 if colname not in keep_colnames:
84 new_colnames.append(i)
85 else:
86 new_colnames.append(colname)
87 return new_colnames
def _unique(df, columns=None):
    """Summarize the distinct non-null values of the requested columns.

    Returns a dict mapping each column name to
    ``{'count': <number of uniques>, 'values': <sorted unique values>}``.
    A single column name may be given as a plain string; when ``columns``
    is omitted or empty, every column of ``df`` is summarized.
    """
    if isinstance(columns, str):
        columns = [columns]
    if not columns:
        columns = df.columns.tolist()
    summary = {}
    for name in columns:
        non_null = df[name].dropna().values
        # _flatten_list handles cells that themselves hold sequences.
        distinct = np.unique(list(_flatten_list(non_null))).tolist()
        summary[name] = {'count': len(distinct), 'values': distinct}
    return summary
114@staticmethod
115def combine_duplicate_rows(X_matrix, y_matrix, rowlabels):
116 X_unique, idxs, invs, cts = np.unique(X_matrix,
117 return_index=True,
118 return_inverse=True,
119 return_counts=True,
120 axis=0)
121 num_unique = X_unique.shape[0]
122 if num_unique == X_matrix.shape[0]:
123 # No duplicate rows
124 return X_matrix, y_matrix, rowlabels
125
126 # Combine duplicate rows
127 y_unique = np.empty((num_unique, y_matrix.shape[1]))
128 rowlabels_unique = np.empty(num_unique, dtype=tuple)
129 ix = np.arange(X_matrix.shape[0])
130 for i, count in enumerate(cts):
131 if count == 1:
132 y_unique[i, :] = y_matrix[idxs[i], :]
133 rowlabels_unique[i] = (rowlabels[idxs[i]],)
134 else:
135 dup_idxs = ix[invs == i]
136 y_unique[i, :] = np.median(y_matrix[dup_idxs, :], axis=0)
137 rowlabels_unique[i] = tuple(rowlabels[dup_idxs])
138 return X_unique, y_unique, rowlabels_unique
def drop_some(df_: pd.DataFrame, thresh: int) -> pd.DataFrame:
    """Drop columns that have fewer than ``thresh`` non-NA values.

    Parameters
    ----------
    df_ : pd.DataFrame
        Frame to prune.
    thresh : int
        Minimum number of non-NA values a column must contain to be kept.
        (This is pandas' ``thresh`` semantics — a non-NA count, not a
        count of NAs as the original comment claimed.)

    Returns
    -------
    pd.DataFrame
        Copy of ``df_`` without the under-populated columns.
    """
    # Pass axis by keyword: positional `axis` for dropna() was deprecated
    # in pandas 1.x and removed in pandas 2.0 (raises TypeError).
    return df_.dropna(axis=1, thresh=thresh)
def dropcols(df, start=None, end=None):
    """Drop columns that contain NaN within [start, end] inclusive.

    A wrapper around ``DataFrame.dropna()`` that derives the *subset* of
    row labels from a date window — an easier syntax for tseries-indexed
    DataFrames.  A NaN outside the window does not disqualify a column.

    Parameters
    ----------
    df : DataFrame
    start : str or datetime, default None
        start cutoff date, inclusive; defaults to the first index label
    end : str or datetime, default None
        end cutoff date, inclusive; defaults to the last index label

    Raises
    ------
    ValueError
        If ``df`` is a Series rather than a DataFrame.
    """
    if isinstance(df, Series):
        raise ValueError("func only applies to `pd.DataFrame`")
    lo = df.index[0] if start is None else start
    hi = df.index[-1] if end is None else end
    in_window = (df.index >= lo) & (df.index <= hi)
    # dropna(axis=1, subset=...) takes the ROW labels to inspect.
    return df.dropna(axis=1, subset=df.index[in_window])
def test_dupes_with_nulls():
    """Rows with duplicated join keys and nulls should still be a subset match."""
    with_dupes = pd.DataFrame(
        {
            "fld_1": [1, 2, 2, 3, 3, 4, 5, 5],
            "fld_2": ["A", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        }
    )
    deduped = pd.DataFrame(
        {"fld_1": [1, 2, 3, 4, 5], "fld_2": ["A", np.nan, np.nan, np.nan, np.nan]}
    )
    comparison = datacompy.Compare(with_dupes, deduped, join_columns=["fld_1", "fld_2"])
    assert comparison.subset()

Related snippets