Every line of 'stratify in train_test_split' code snippets is scanned for vulnerabilities by our powerful machine learning engine that combs millions of open source libraries, ensuring your Python code is secure.
def train_test_split(X, y, train_percentage=0.8):
    '''
    Split X and y into a leading train part and a trailing test part.

    Very simple, order-preserving split (no shuffling, no stratification).
    Works for any input shape without dependencies, but is a bit restricted:
    X must expose ``shape[0]`` and both X and y must support slicing along
    their first axis.

    :param X: samples; the first axis indexes the samples
    :param y: targets, sliced at the same index as X
    :param train_percentage: fraction (0.0 - 1.0) of samples put in the train part
    :return: ``(X_train, y_train), (X_test, y_test)``
    :raises ValueError: if ``train_percentage`` is outside ``[0, 1]``
    '''
    if not 0.0 <= train_percentage <= 1.0:
        raise ValueError(
            "train_percentage must be a fraction between 0 and 1, got {!r}".format(
                train_percentage))

    # Honour the caller's fraction instead of a hard-coded 0.80.
    # int() truncates, which equals floor() for non-negative values.
    cut_idx = int(X.shape[0] * train_percentage)
    X_train, X_test = X[:cut_idx], X[cut_idx:]
    y_train, y_test = y[:cut_idx], y[cut_idx:]
    print("Number of train samples", X_train.shape[0])
    print("Number of test samples", X_test.shape[0])

    return (X_train, y_train), (X_test, y_test)
def _build_rating_matrix(ratings, n_rows, n_cols):
    """Return a dense (n_rows, n_cols) matrix filled from the rating rows."""
    matrix = np.zeros((n_rows, n_cols))
    for line in ratings.itertuples():
        # itertuples() puts the index at position 0, so positions 1..3 are
        # the first three columns (user_id, item_id, rating per the header
        # used by the caller). IDs are shifted by one to get 0-based indices.
        matrix[line[1] - 1, line[2] - 1] = line[3]
    return matrix


def train_test_split(fileName, type=1):
    """Load a ratings file and return train/test user-item rating matrices.

    Args:
        fileName: path of the ratings file to read.
        type: 1 reads a tab-separated file; any other value reads a
            '::'-separated file (parsed with the python engine).
            NOTE: the name shadows the builtin but is kept for callers.

    Returns:
        (train_data_matrix, test_data_matrix): two dense arrays whose shape is
        (largest user id, largest item id); cells without a rating stay 0.
    """
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    if type == 1:
        df = pd.read_csv(fileName, sep='\t', names=header)
    else:
        df = pd.read_csv(fileName, sep='::', names=header, engine='python')
    n_users = df.user_id.unique().shape[0]
    users = df.user_id.max()
    n_items = df.item_id.unique().shape[0]
    items = df.item_id.max()

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))
    print('The biggest ID of users = ' + str(users) + ' | The biggest ID of movies = ' + str(items))

    # Hold out 10% of the rating rows for testing.
    train_data, test_data = cv.train_test_split(df, test_size=0.1)
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)

    # Create two user-item matrices, one for training and another for testing.
    # They are sized by the largest IDs (not the unique counts) so that
    # every `id - 1` index is in range.
    train_data_matrix = _build_rating_matrix(train_data, users, items)
    test_data_matrix = _build_rating_matrix(test_data, users, items)
    return train_data_matrix, test_data_matrix
423 def _train_val_split(df, validation): 424 train_df = df 425 val_df = None 426 validation_ratio = 0.0 427 428 if isinstance(validation, float) and validation > 0: 429 train_df, val_df = train_df.randomSplit([1.0 - validation, validation]) 430 validation_ratio = validation 431 elif isinstance(validation, str): 432 dtype = [field.dataType for field in df.schema.fields if field.name == validation][0] 433 bool_dtype = isinstance(dtype, BooleanType) 434 val_df = train_df.filter( 435 f.col(validation) if bool_dtype else f.col(validation) > 0).drop(validation) 436 train_df = train_df.filter( 437 ~f.col(validation) if bool_dtype else f.col(validation) == 0).drop(validation) 438 439 # Approximate ratio of validation data to training data for proportionate scale 440 # of partitions 441 timeout_ms = 1000 442 confidence = 0.90 443 train_rows = train_df.rdd.countApprox(timeout=timeout_ms, confidence=confidence) 444 val_rows = val_df.rdd.countApprox(timeout=timeout_ms, confidence=confidence) 445 validation_ratio = val_rows / (val_rows + train_rows) 446 elif validation: 447 raise ValueError('Unrecognized validation type: {}'.format(type(validation))) 448 449 return train_df, val_df, validation_ratio
def _get_adapted_dataset(split):
    """ Fetches one split of the dataset, adapting it unless it is 'train'

    Args :
        split (str): suffix selecting the 'x_<split>' / 'y_<split>' entries
            (train or test)
    Returns :
        (tuple): images and labels
    """
    data = _get_dataset()
    images_key = 'x_' + split
    labels_key = 'y_' + split

    # The training split is handed back untouched.
    if split == 'train':
        return (data[images_key], data[labels_key])

    # Every other split goes through _adapt; the adapted pair is written
    # back into the dataset mapping before being returned.
    adapted = _adapt(data[images_key], data[labels_key])
    data[images_key], data[labels_key] = adapted
    return (data[images_key], data[labels_key])
def test_split(self):
    """
    Apply split to the sample described in the docstring of prepare_time_inhomogeneous_cv_object, with n_splits = 4
    and n_test_splits = 2. The folds are [0 : 6], [6 : 11], [11 : 16], [16 : 21]. We use an embargo of zero.
    Only the third and fourth train/test pairs produced by the splitter are checked here:

    3. Train: folds 1 and 4, samples [0, 1, 2, 3, 4, 16, 17, 18, 19, 20]. Test: folds 2 and 3, samples [6, 7, 8, 9,
       10, 11, 12, 13, 14, 15]. Sample 5 is purged from the train set.
    4. Train: folds 2 and 3, samples [7, 8, 9, 10, 11, 12, 13, 14, 15]. Test: folds 1 and 4, samples [0, 1, 2, 3, 4,
       5, 16, 17, 18, 19, 20]. Sample 6 is embargoed.

    NOTE(review): "embargo of zero" and "sample 6 is embargoed" read as contradictory - confirm the wording against
    prepare_time_inhomogeneous_cv_object.
    """
    cv = CombPurgedKFoldCV(n_splits=4, n_test_splits=2)
    prepare_time_inhomogeneous_cv_object(cv)

    # Expected (train, test) index arrays keyed by the 1-based split number.
    expected = {
        3: (np.array([0, 1, 2, 3, 4, 16, 17, 18, 19, 20]),
            np.array([6, 7, 8, 9, 10, 11, 12, 13, 14, 15])),
        4: (np.array([7, 8, 9, 10, 11, 12, 13, 14, 15]),
            np.array([0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20])),
    }

    count = 0
    splits = cv.split(cv.X, pred_times=cv.pred_times, eval_times=cv.eval_times)
    for count, (train_set, test_set) in enumerate(splits, start=1):
        if count in expected:
            result_train, result_test = expected[count]
            self.assertTrue(np.array_equal(result_train, train_set))
            self.assertTrue(np.array_equal(result_test, test_set))

    # Without this guard the test would pass vacuously if the splitter
    # produced fewer than four splits and the checks above never ran.
    self.assertGreaterEqual(count, 4)
def train_val_test_split(id_lists, train_fraction, val_fraction, test_fraction):
    """Split every id list into train/val/test parts and pool the results.

    Args:
        id_lists: iterable of id lists, one list per dataset.
        train_fraction: fraction passed to ``make_splits`` for the train part.
        val_fraction: fraction passed to ``make_splits`` for the validation part.
        test_fraction: fraction passed to ``make_splits`` for the test part.

    Returns:
        dict with keys 'train', 'valid' and 'test', each holding the
        concatenation of that part across all datasets.
    """
    train_ids = []
    val_ids = []
    test_ids = []

    for dataset_idx, id_list in enumerate(id_lists):
        # Report the size of this dataset (len(id_list)), not the number
        # of datasets (len(id_lists)).
        print('dataset', dataset_idx, 'contains', len(id_list), 'items.')
        train, val, test = make_splits(id_list, [train_fraction, val_fraction, test_fraction])
        train_ids += train
        val_ids += val
        test_ids += test
    print('train_ids', len(train_ids), 'val_ids', len(val_ids), 'test_ids', len(test_ids))

    return {'train': train_ids, 'valid': val_ids, 'test': test_ids}
def split(df):
    '''
    Split a dataframe 80/20 and sort both parts by 'user_id'.

    :param df: Dataframe to be split
    :return: tuple (sorted test part, sorted train part) - note that the
             test part comes first
    '''
    train_part, test_part = train_test_split(df, test_size=0.2)
    ordered_train, ordered_test = (
        part.sort_values('user_id') for part in (train_part, test_part)
    )
    return ordered_test, ordered_train
def split_data(df):
    """Split ``df`` into train and test feature/target arrays.

    Column 'Y' is the target; all remaining columns are the features.
    20% of the rows go to the test part, with ``random_state=0``.

    Returns:
        dict of the form
        ``{"train": {"X": ..., "y": ...}, "test": {"X": ..., "y": ...}}``.
    """
    features = df.drop('Y', axis=1).values
    target = df['Y'].values

    parts = train_test_split(features, target, test_size=0.2, random_state=0)
    X_train, X_test, y_train, y_test = parts

    return {
        "train": {"X": X_train, "y": y_train},
        "test": {"X": X_test, "y": y_test},
    }
def my_train_split(ds, y):
    """Return ``ds`` unchanged together with a fixed validation dataset.

    The validation dataset wraps the first 200 entries of ``corpus.valid``
    with no targets. The ``y`` argument is accepted but not used.
    """
    valid_ds = skorch.dataset.Dataset(corpus.valid[:200], y=None)
    return ds, valid_ds
def train_test_split_result(clf, X, y):
    """Fit ``clf`` on a train split of (X, y) and report on the test split.

    The split uses the defaults of ``cross_validation.train_test_split``.
    The fitted estimator, the test data and the training targets are then
    passed to ``report_result``. Nothing is returned.
    """
    print("This is Random and Percentaged Spilt Result ... ")
    splits = cross_validation.train_test_split(X, y)
    X_train, X_test, y_train, y_test = splits
    fitted = clf.fit(X_train, y_train)
    report_result(fitted, X_test, y_test, y_train)