Every line of the 'train_test_split' Python code snippets below is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help keep your Python code secure.
def my_train_split(ds, y):
    # Keep the training set as-is; use the first 200 held-out examples from a
    # `corpus` object in the enclosing scope as a target-less validation set.
    return ds, skorch.dataset.Dataset(corpus.valid[:200], y=None)
import numpy as np
import pandas as pd
from sklearn import model_selection as cv


def train_test_split(fileName, type=1):
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    if type == 1:
        df = pd.read_csv(fileName, sep='\t', names=header)
    else:
        df = pd.read_csv(fileName, sep='::', names=header, engine='python')
    n_users = df.user_id.unique().shape[0]
    users = df.user_id.max()
    n_items = df.item_id.unique().shape[0]
    items = df.item_id.max()

    print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))
    print('The biggest ID of users = ' + str(users) + ' | The biggest ID of movies = ' + str(items))

    train_data, test_data = cv.train_test_split(df, test_size=0.1)
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)
    # Create two user-item matrices, one for training and another for testing
    train_data_matrix = np.zeros((users, items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

    test_data_matrix = np.zeros((users, items))
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    return train_data_matrix, test_data_matrix
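A minimal usage sketch, assuming a tab-separated MovieLens-style ratings file; the path 'u.data' is hypothetical:

train_matrix, test_matrix = train_test_split('u.data', type=1)
print(train_matrix.shape, test_matrix.shape)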
import os
from glob import glob
from shutil import copyfile, rmtree

import numpy as np


def train_valid_test_split(SOURCE_DATA_DIR, TARGET_DATA_DIR, train_size=0.8, valid_size=0.1,
                           COMBINE_FOLDERS=None, SELECT_FOLDERS=None):
    """
    Usage:
        SOURCE_DATA_DIR = "data/ClothingAttributeDataset/images/"
        TARGET_DATA_DIR = "data/"

        train_valid_test_split(SOURCE_DATA_DIR, TARGET_DATA_DIR)
    """
    if COMBINE_FOLDERS is None:
        COMBINE_FOLDERS = dict()
    # Recreate empty train / test / valid folders in the target directory.
    for folder_name in ["train", "test", "valid"]:
        rmtree(os.path.join(TARGET_DATA_DIR, folder_name), ignore_errors=True)
        os.makedirs(os.path.join(TARGET_DATA_DIR, folder_name))

    # Shuffle the source images, then split them into train / valid / test
    # according to train_size and valid_size (the remainder becomes the test set).
    filenames = np.random.permutation(glob(os.path.join(SOURCE_DATA_DIR, "*.jpg")))

    train_idx = int(len(filenames) * train_size)
    test_idx = int(len(filenames) * (train_size + valid_size))
    for idx, filename in enumerate(filenames):
        target_name = os.path.basename(filename)
        if idx < train_idx:
            target_filepath = os.path.join(TARGET_DATA_DIR, "train", target_name)
        elif idx < test_idx:
            target_filepath = os.path.join(TARGET_DATA_DIR, "valid", target_name)
        else:
            target_filepath = os.path.join(TARGET_DATA_DIR, "test", target_name)
        copyfile(filenames[idx], target_filepath)
from math import floor


def train_test_split(X, y, train_percentage=0.8):
    '''
    Very simple splitting into train and test data. Works for
    any input shape without dependencies, but is a bit restricted.
    '''
    cut_idx = int(floor(X.shape[0] * train_percentage))
    X_train, X_test = X[:cut_idx], X[cut_idx:]
    y_train, y_test = y[:cut_idx], y[cut_idx:]
    print("Number of train samples", X_train.shape[0])
    print("Number of test samples", X_test.shape[0])

    return (X_train, y_train), (X_test, y_test)
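A quick usage sketch with synthetic NumPy arrays (shapes chosen arbitrarily for illustration):

import numpy as np

X = np.random.rand(100, 4)             # 100 samples, 4 features
y = np.random.randint(0, 2, size=100)  # binary targets
(X_train, y_train), (X_test, y_test) = train_test_split(X, y, train_percentage=0.8)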
def test_split(self):
    """
    Apply split to the sample described in the docstring of prepare_time_inhomogeneous_cv_object, with n_splits = 4
    and n_test_splits = 2. The folds are [0 : 6], [6 : 11], [11 : 16], [16 : 21]. We use an embargo of zero.
    Inspection shows that the train/test pairs should respectively be
    [...]
    3. Train: folds 1 and 4, samples [0, 1, 2, 3, 4, 16, 17, 18, 19, 20]. Test: folds 2 and 3, samples [6, 7, 8, 9,
       10, 11, 12, 13, 14, 15]. Sample 5 is purged from the train set.
    4. Train: folds 2 and 3, samples [7, 8, 9, 10, 11, 12, 13, 14, 15]. Test: folds 1 and 4, samples [0, 1, 2, 3, 4,
       5, 16, 17, 18, 19, 20]. Sample 6 is embargoed.
    [...]
    """
    cv = CombPurgedKFoldCV(n_splits=4, n_test_splits=2)
    prepare_time_inhomogeneous_cv_object(cv)
    count = 0
    for train_set, test_set in cv.split(cv.X, pred_times=cv.pred_times, eval_times=cv.eval_times):
        count += 1
        if count == 3:
            result_train = np.array([0, 1, 2, 3, 4, 16, 17, 18, 19, 20])
            result_test = np.array([6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
            self.assertTrue(np.array_equal(result_train, train_set))
            self.assertTrue(np.array_equal(result_test, test_set))
        if count == 4:
            result_train = np.array([7, 8, 9, 10, 11, 12, 13, 14, 15])
            result_test = np.array([0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20])
            self.assertTrue(np.array_equal(result_train, train_set))
            self.assertTrue(np.array_equal(result_test, test_set))
from torch.utils.data import random_split


def train_valid_split(dataset, validation_amount):
    # Reserve a fraction `validation_amount` of the dataset for validation and use the rest for training.
    valid_length = int(validation_amount * len(dataset))
    train_length = len(dataset) - valid_length

    train_dataset, valid_dataset = random_split(dataset, [train_length, valid_length])
    return train_dataset, valid_dataset
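A usage sketch, assuming a small PyTorch TensorDataset built purely for illustration:

import torch
from torch.utils.data import TensorDataset

dataset = TensorDataset(torch.randn(100, 3), torch.randint(0, 2, (100,)))
train_ds, valid_ds = train_valid_split(dataset, validation_amount=0.2)
print(len(train_ds), len(valid_ds))  # 80 20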
from sklearn.model_selection import train_test_split


def split(df):
    '''
    :param df: DataFrame to be split
    :return: the test and train DataFrames, each sorted by user_id
    '''
    trainingSet, testSet = train_test_split(df, test_size=0.2)
    sorted_trainSet = trainingSet.sort_values('user_id')
    sorted_testSet = testSet.sort_values('user_id')
    return sorted_testSet, sorted_trainSet
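A minimal usage sketch with a toy ratings DataFrame; the column names are chosen to match what the function expects:

import pandas as pd

ratings = pd.DataFrame({'user_id': [3, 1, 2, 5, 4, 1, 2, 3, 4, 5],
                        'rating':  [4, 5, 3, 2, 4, 1, 5, 3, 2, 4]})
test_df, train_df = split(ratings)
print(len(train_df), len(test_df))  # 8 2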
from sklearn.model_selection import train_test_split


def split_data(df):
    # Separate the features from the target column 'Y'.
    X = df.drop('Y', axis=1).values
    y = df['Y'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    data = {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}
    return data
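A usage sketch with a synthetic DataFrame; the feature column names here are illustrative only:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(50, 3), columns=['f1', 'f2', 'Y'])
data = split_data(df)
print(data["train"]["X"].shape, data["test"]["X"].shape)  # (40, 2) (10, 2)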
def _split_train_test(self):
    '''
    Split the data into train and test parts.
    '''

    for (trainindices, testindices) in \
            self.foldsplitter(self.eventmemes_all,
                              [self.train_fraction]):
        # Index all per-event arrays with the training indices.
        trainetimes = self.etimes_all[trainindices]
        traininfecting_vec = self.infecting_vec_all[trainindices]
        traininfected_vec = self.infected_vec_all[trainindices]
        trainnode_vec = self.node_vec_all[trainindices]
        traineventmemes = self.eventmemes_all[trainindices]
        trainW = self.W_all[trainindices, :]
        trainT = max(trainetimes)
        trainN = len(trainetimes)

        # Index the same arrays with the test indices.
        testtimes = self.etimes_all[testindices]
        testinfecting_vec = self.infecting_vec_all[testindices]
        testinfected_vec = self.infected_vec_all[testindices]
        testnode_vec = self.node_vec_all[testindices]
        testeventmemes = self.eventmemes_all[testindices]
        testW = self.W_all[testindices, :]
        testT = max(testtimes)
        testN = len(testtimes)

        yield ((
            trainN,
            trainetimes,
            traininfecting_vec,
            traininfected_vec,
            trainnode_vec,
            traineventmemes,
            trainW,
            trainT,
        ), (
            testN,
            testtimes,
            testinfecting_vec,
            testinfected_vec,
            testnode_vec,
            testeventmemes,
            testW,
            testT,
        ))
@classmethod
def training_split(cls,
                   dataset_folder,
                   number_of_validation_examples=500,
                   maximum_disparity=255):
    """Returns training and validation datasets.

    An example from the FlyingThings3D dataset is added to the training
    or validation dataset if:

    (1) it is a training example of the FlyingThings3D dataset;
    (2) it does not have rendering artifacts;
    (3) all its disparities are within the range [0, maximum_disparity].

    Args:
        dataset_folder: folder with the FlyingThings3D dataset, which contains
                        a "frames_cleanpass" folder with left and right
                        images and a "disparity" folder with disparities.
        number_of_validation_examples: number of examples from the training set
                                       that will be used for validation.
        maximum_disparity: maximum disparity in the training / validation
                           dataset. All training examples with disparity
                           larger than "maximum_disparity" are excluded
                           from the dataset.
    """
    examples = _find_examples(dataset_folder)
    # A fixed random seed guarantees that the splits are the same across runs.
    random.seed(0)
    random.shuffle(examples)
    examples = _split_examples_into_training_and_test_sets(examples)[0]
    examples = _filter_out_examples_with_rendering_artifacts(examples)
    examples = _filter_out_examples_with_large_disparities(
        examples, maximum_disparity)
    _dataset = FlyingThings3D(examples)
    validation_dataset, training_dataset = _dataset.split_in_two(
        size_of_first_subset=number_of_validation_examples)
    return training_dataset, validation_dataset