def filter(self, func):
    """
    Return a copy of a DataFrame excluding elements from groups that
    do not satisfy the boolean criterion specified by func.

    Parameters
    ----------
    func : function
        Function to apply to each subframe. Should return True or False.
        Groups for which it returns False are dropped from the result.

    Returns
    -------
    filtered : DataFrame

    Raises
    ------
    TypeError
        If ``func`` is not callable.

    Notes
    -----
    Each subframe is endowed the attribute 'name' in case you need to know
    which group you are working on.

    Examples
    --------
    >>> df = ks.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : [1, 2, 3, 4, 5, 6],
    ...                    'C' : [2.0, 5., 8., 1., 2., 9.]}, columns=['A', 'B', 'C'])
    >>> grouped = df.groupby('A')
    >>> grouped.filter(lambda x: x['B'].mean() > 3.)
         A  B    C
    1  bar  2  5.0
    3  bar  4  1.0
    5  bar  6  9.0
    """
    # Validate eagerly on the driver so a bad argument fails here rather
    # than inside the function that is shipped off to run per group.
    if not callable(func):
        raise TypeError("%s object is not callable" % type(func))

    # Filtering only removes rows, so the schema of the underlying Spark
    # frame is reused unchanged as the declared output schema.
    data_schema = self._kdf._sdf.schema
    groupby_names = [s.name for s in self._groupkeys]

    def pandas_filter(pdf):
        # Delegate the actual group-wise filtering to pandas.
        return pdf.groupby(groupby_names).filter(func)

    sdf = self._spark_group_map_apply(
        pandas_filter, data_schema, retain_index=True)

    # Rebuild the frame on top of the filtered Spark frame, re-resolving
    # every data column against the new `sdf`.
    internal = self._kdf._internal
    return DataFrame(internal.copy(
        sdf=sdf,
        column_scols=[scol_for(sdf, col) for col in internal.data_columns]))