10 examples of 'stratify in train_test_split' in Python

Every line of 'stratify in train_test_split' code snippets is scanned for vulnerabilities by our powerful machine learning engine that combs millions of open source libraries, ensuring your Python code is secure.

All examples are scanned by Snyk Code

By copying the Snyk Code Snippets you agree to
def train_test_split(X, y, train_percentage=0.8):
    '''
    Very simple splitting into train and test data. Works for
    any input shape without dependencies, but is a bit restricted:
    nothing is shuffled, the leading rows form the train set and the
    remaining rows form the test set.

    :param X: samples; must expose ``shape[0]`` and support slicing.
    :param y: targets, sliced at the same index as ``X``.
    :param train_percentage: fraction of the rows of ``X`` that goes
        into the train set.
    :return: ``(X_train, y_train), (X_test, y_test)``
    '''
    # Use the caller-supplied fraction; the cut was previously hard-coded
    # to 0.80, which silently ignored ``train_percentage``.
    cut_idx = int(floor(X.shape[0] * train_percentage))
    X_train, X_test = X[:cut_idx], X[cut_idx:]
    y_train, y_test = y[:cut_idx], y[cut_idx:]
    print("Number of train samples", X_train.shape[0])
    print("Number of test samples", X_test.shape[0])

    return (X_train, y_train), (X_test, y_test)
def train_test_split(fileName,type=1):
    """Load a ratings file and build train/test user-item matrices.

    The file is read without a header row into the columns ``user_id``,
    ``item_id``, ``rating`` and ``timestamp``. It is then split with
    ``cv.train_test_split(df, test_size=0.1)`` and each part is turned
    into a dense matrix with ``matrix[user_id - 1, item_id - 1] = rating``.

    :param fileName: path (or buffer) handed to ``pd.read_csv``.
    :param type: ``1`` reads a tab-separated file; any other value reads a
        ``::``-separated file using the python parsing engine.
    :return: ``(train_data_matrix, test_data_matrix)``, both of shape
        ``(max user_id, max item_id)``.
    """
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    if type == 1:
        df = pd.read_csv(fileName, sep='\t', names=header)
    else:
        df = pd.read_csv(fileName, sep='::', names=header, engine='python')
    n_users = df.user_id.unique().shape[0]
    users = df.user_id.max()
    n_items = df.item_id.unique().shape[0]
    items = df.item_id.max()

    # Parenthesised print of a single string behaves the same on Python 2
    # and Python 3; the bare print statement is a syntax error on Python 3.
    print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))
    print('The biggest ID of users = ' + str(users) + ' | The biggest ID of movies = ' + str(items))

    train_data, test_data = cv.train_test_split(df, test_size=0.1)
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)

    def _to_matrix(frame):
        # Matrices are sized by the largest ID (not the number of distinct
        # IDs) so that ``id - 1`` is always a valid index.
        matrix = np.zeros((users, items))
        # itertuples() rows are (index, user_id, item_id, rating, timestamp).
        for row in frame.itertuples():
            matrix[row[1] - 1, row[2] - 1] = row[3]
        return matrix

    # Create two user-item matrices, one for training and another for testing
    train_data_matrix = _to_matrix(train_data)
    test_data_matrix = _to_matrix(test_data)
    return train_data_matrix, test_data_matrix
423def _train_val_split(df, validation):
424 train_df = df
425 val_df = None
426 validation_ratio = 0.0
427
428 if isinstance(validation, float) and validation > 0:
429 train_df, val_df = train_df.randomSplit([1.0 - validation, validation])
430 validation_ratio = validation
431 elif isinstance(validation, str):
432 dtype = [field.dataType for field in df.schema.fields if field.name == validation][0]
433 bool_dtype = isinstance(dtype, BooleanType)
434 val_df = train_df.filter(
435 f.col(validation) if bool_dtype else f.col(validation) > 0).drop(validation)
436 train_df = train_df.filter(
437 ~f.col(validation) if bool_dtype else f.col(validation) == 0).drop(validation)
438
439 # Approximate ratio of validation data to training data for proportionate scale
440 # of partitions
441 timeout_ms = 1000
442 confidence = 0.90
443 train_rows = train_df.rdd.countApprox(timeout=timeout_ms, confidence=confidence)
444 val_rows = val_df.rdd.countApprox(timeout=timeout_ms, confidence=confidence)
445 validation_ratio = val_rows / (val_rows + train_rows)
446 elif validation:
447 raise ValueError('Unrecognized validation type: {}'.format(type(validation)))
448
449 return train_df, val_df, validation_ratio
def _get_adapted_dataset(split):
    """ Returns the images and labels of one split of the dataset

    For every split other than 'train' the pair is first passed through
    ``_adapt`` and the result is stored back into the dataset under the
    same keys.

    Args :
        split (str): suffix used to build the ``x_<split>`` and
            ``y_<split>`` keys (train or test)
    Returns :
        (tuple): images and labels
    """
    data = _get_dataset()
    img_key, lbl_key = 'x_' + split, 'y_' + split

    if split == 'train':
        return (data[img_key], data[lbl_key])

    adapted_img, adapted_lbl = _adapt(data[img_key], data[lbl_key])
    data[img_key] = adapted_img
    data[lbl_key] = adapted_lbl
    return (data[img_key], data[lbl_key])
def test_split(self):
    """
    Check the third and fourth train/test pairs yielded by split on the sample described in the docstring of
    prepare_time_inhomogeneous_cv_object, with n_splits = 4 and n_test_splits = 2.
    The folds are [0 : 6], [6 : 11], [11 : 16], [16 : 21]. We use an embargo of zero.
    The expected pairs are
    3. Train: folds 1 and 4, samples [0, 1, 2, 3, 4, 16, 17, 18, 19, 20]. Test: folds 2 and 3, samples [6, 7, 8, 9,
       10, 11, 12, 13, 14, 15]. Sample 5 is purged from the train set.
    4. Train: folds 2 and 3, samples [7, 8, 9, 10, 11, 12, 13, 14, 15]. Test: folds 1 and 4, samples [0, 1, 2, 3, 4,
       5, 16, 17, 18, 19, 20]. Sample 6 is embargoed.
    The other pairs are not inspected.
    """
    # 1-based position of the pair in the split sequence -> (train, test)
    expected_by_position = {
        3: (np.array([0, 1, 2, 3, 4, 16, 17, 18, 19, 20]),
            np.array([6, 7, 8, 9, 10, 11, 12, 13, 14, 15])),
        4: (np.array([7, 8, 9, 10, 11, 12, 13, 14, 15]),
            np.array([0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20])),
    }

    cv = CombPurgedKFoldCV(n_splits=4, n_test_splits=2)
    prepare_time_inhomogeneous_cv_object(cv)
    pairs = cv.split(cv.X, pred_times=cv.pred_times, eval_times=cv.eval_times)
    for position, (train_set, test_set) in enumerate(pairs, start=1):
        if position not in expected_by_position:
            continue
        result_train, result_test = expected_by_position[position]
        self.assertTrue(np.array_equal(result_train, train_set))
        self.assertTrue(np.array_equal(result_test, test_set))
def train_val_test_split(id_lists, train_fraction, val_fraction, test_fraction):
    """Split every id list separately and merge the per-dataset splits.

    Each element of ``id_lists`` is passed to ``make_splits`` together with
    ``[train_fraction, val_fraction, test_fraction]``; the three parts it
    returns are appended to the combined train, validation and test lists.

    :param id_lists: iterable of id lists, one per dataset.
    :param train_fraction: fraction forwarded to ``make_splits`` for train.
    :param val_fraction: fraction forwarded to ``make_splits`` for validation.
    :param test_fraction: fraction forwarded to ``make_splits`` for test.
    :return: dict with the keys ``'train'``, ``'valid'`` and ``'test'``
        holding the combined id lists.
    """
    train_ids = []
    val_ids = []
    test_ids = []

    for dataset_idx, id_list in enumerate(id_lists):
        # Report the size of this dataset; the message previously printed
        # len(id_lists), i.e. the number of datasets, for every dataset.
        print('dataset', dataset_idx, 'contains', len(id_list), 'items.')
        train, val, test = make_splits(id_list, [train_fraction, val_fraction, test_fraction])
        train_ids.extend(train)
        val_ids.extend(val)
        test_ids.extend(test)
    # NOTE(review): the source's indentation was lost; this summary is
    # assumed to run once after the loop -- confirm against the original.
    print('train_ids', len(train_ids), 'val_ids', len(val_ids), 'test_ids', len(test_ids))

    return {'train': train_ids, 'valid': val_ids, 'test': test_ids}
def split(df):
    '''
    Split a dataframe with ``train_test_split(df, test_size=0.2)`` and sort
    both parts by ``user_id``.

    :param df: Dataframe to be split
    :return: tuple ``(test, train)`` of the sorted parts; note that the
        test part comes first
    '''
    parts = train_test_split(df, test_size=0.2)
    train_part, test_part = parts
    train_by_user = train_part.sort_values('user_id')
    test_by_user = test_part.sort_values('user_id')
    return test_by_user, train_by_user
def split_data(df):
    """Separate the ``Y`` column from the other columns and split both.

    The split is made with ``train_test_split(..., test_size=0.2,
    random_state=0)``.

    :param df: dataframe containing a ``Y`` column.
    :return: ``{"train": {"X": ..., "y": ...}, "test": {"X": ..., "y": ...}}``
    """
    targets = df['Y'].values
    features = df.drop('Y', axis=1).values

    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=0)

    return {
        "train": {"X": X_train, "y": y_train},
        "test": {"X": X_test, "y": y_test},
    }
def my_train_split(ds, y):
    """Return ``ds`` unchanged together with a fixed validation dataset.

    The validation dataset wraps ``corpus.valid[:200]`` in a
    ``skorch.dataset.Dataset`` with ``y=None``; the ``y`` argument is not used.
    """
    valid_ds = skorch.dataset.Dataset(corpus.valid[:200], y=None)
    return ds, valid_ds
def train_test_split_result(clf, X, y):
    """Fit ``clf`` on one train/test split of ``X``, ``y`` and report on it.

    The data is split with ``cross_validation.train_test_split(X, y)``, the
    classifier is fitted on the train part, and the fitted classifier is
    handed to ``report_result`` together with ``X_test``, ``y_test`` and
    ``y_train``. Nothing is returned.
    """
    print("This is Random and Percentaged Spilt Result ... ")
    parts = cross_validation.train_test_split(X, y)
    X_train, X_test, y_train, y_test = parts
    fitted = clf.fit(X_train, y_train)
    report_result(fitted, X_test, y_test, y_train)

Related snippets