-
Notifications
You must be signed in to change notification settings - Fork 51
Expand file tree
/
Copy pathdatasets.py
More file actions
125 lines (90 loc) · 4.75 KB
/
datasets.py
File metadata and controls
125 lines (90 loc) · 4.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import config
import theano
import theano.tensor as T
import numpy as np
import struct
import os
import scipy.io
class DatasetTheano():
    """Holds train/validation/test splits as Theano shared variables.

    The last ``n_used_for_validation`` rows of ``train_data`` (after an
    optional deterministic shuffle) become the validation split; the rest
    remain training data. ``test_data`` is stored as-is. All splits are
    cast to ``theano.config.floatX`` and uploaded as shared variables.
    """
    def __init__(self, train_data, test_data, n_used_for_validation, shuffle=False, shuffle_seed=123):
        self.data = {}
        if shuffle:
            # Fixed-seed RandomState so the train/validation split is
            # reproducible across runs.
            permutation = np.random.RandomState(seed=shuffle_seed).permutation(train_data.shape[0])
            train_data = train_data[permutation]
        if n_used_for_validation > 0:
            self.data['train'] = train_data[:-n_used_for_validation]
            self.data['validation'] = train_data[-n_used_for_validation:]
        else:
            # BUGFIX: train_data[:-0] is an EMPTY slice, which would silently
            # discard the whole training set; handle n == 0 explicitly.
            self.data['train'] = train_data
            self.data['validation'] = train_data[:0]
        self.data['test'] = test_data
        for subdataset in ['train', 'validation', 'test']:
            self.data[subdataset] = theano.shared(value=self.data[subdataset].astype(theano.config.floatX))
    def minibatchIindex_minibatch_size(self, index, minibatch_size, subdataset='train', **kwargs):
        """Return the ``index``-th minibatch of ``minibatch_size`` rows as a symbolic slice."""
        return self.data[subdataset][index*minibatch_size: (index+1)*minibatch_size]
    def get_data_dim(self):
        """Number of features per example (columns of the train matrix)."""
        return self.data['train'].get_value(borrow=True).shape[1]
    def get_n_examples(self, subdataset):
        """Number of examples in the given split."""
        return self.data[subdataset].get_value(borrow=True).shape[0]
    def get_train_bias_np(self):
        """Inverse sigmoid of the clipped train mean — a standard visible-bias init."""
        return -np.log(1./np.clip(self.get_train_mean_np(), 0.001, 0.999)-1.)\
            .astype(theano.config.floatX)
    def get_train_mean_np(self):
        """Per-feature mean of the training data, shape (1, dim)."""
        return np.mean(self.data['train'].get_value(), axis=0)[None, :].astype(theano.config.floatX)
class BinarizedDatasetTheano():
    """Wraps a DatasetTheano, sampling a fresh Bernoulli binarization per minibatch."""
    def __init__(self, dataset):
        # Share the wrapped dataset's Theano shared variables; nothing is copied.
        self.data = dict(dataset.data)
    def minibatchIindex_minibatch_size(self, index, minibatch_size, srng, subdataset):
        """Slice out a minibatch and binarize it by thresholding uniform noise.

        Each pixel becomes 1 with probability equal to its grey value, so
        every epoch sees a different stochastic binarization.
        """
        start = index * minibatch_size
        probs = self.data[subdataset][start: start + minibatch_size]
        noise = srng.uniform(probs.shape)
        return T.cast(T.le(noise, probs), probs.dtype)
    def get_data_dim(self):
        """Number of features per example (columns of the train matrix)."""
        return self.data['train'].get_value(borrow=True).shape[1]
    def get_n_examples(self, subdataset):
        """Number of examples in the given split."""
        return self.data[subdataset].get_value(borrow=True).shape[0]
    def get_train_bias_np(self):
        """Inverse sigmoid of the clipped train mean — a standard visible-bias init."""
        clipped = np.clip(self.get_train_mean_np(), 0.001, 0.999)
        return (-np.log(1. / clipped - 1.)).astype(theano.config.floatX)
    def get_train_mean_np(self):
        """Per-feature mean of the training data, shape (1, dim)."""
        train_values = self.data['train'].get_value()
        return train_values.mean(axis=0)[None, :].astype(theano.config.floatX)
def binarized_mnist(n_validation=400):
    """Load raw MNIST (idx3-ubyte files) wrapped for per-minibatch binarization."""
    def load_mnist_images_np(imgs_filename):
        # idx3 layout: 4-byte magic number, then three big-endian int32s
        # (image count, rows, cols), then the raw pixel bytes.
        with open(imgs_filename, 'rb') as f:
            f.seek(4)
            nimages, rows, cols = struct.unpack('>iii', f.read(12))
            pixels = np.fromfile(f, dtype=np.dtype(np.ubyte))
        # Scale bytes to [0, 1] probabilities, one flattened image per row.
        return (pixels / 255.0).astype('float32').reshape((nimages, rows * cols))
    mnist_dir = os.path.join(config.DATASETS_DIR, 'MNIST')
    train_data = load_mnist_images_np(os.path.join(mnist_dir, 'train-images-idx3-ubyte'))
    test_data = load_mnist_images_np(os.path.join(mnist_dir, 't10k-images-idx3-ubyte'))
    return BinarizedDatasetTheano(DatasetTheano(train_data, test_data, n_validation, shuffle=False))
def binarized_shuffled_omniglot(n_validation=1345):
    """Load OMNIGLOT from chardata.mat, shuffled, wrapped for per-minibatch binarization."""
    def reshape_data(data):
        # The .mat file stores images column-major, so rebuild each 28x28
        # image and re-flatten it in Fortran order to restore pixel layout.
        # BUGFIX: modern NumPy only accepts 'C'/'F'/'A' for `order`; the old
        # loose alias 'fortran' (matched by first letter) was deprecated and
        # now raises. 'F' is the exact equivalent.
        return data.reshape((-1, 28, 28)).reshape((-1, 28*28), order='F')
    omni_raw = scipy.io.loadmat(
        os.path.join(config.DATASETS_DIR, 'OMNIGLOT', 'chardata.mat'))
    train_data = reshape_data(omni_raw['data'].T.astype('float32'))
    test_data = reshape_data(omni_raw['testdata'].T.astype('float32'))
    # shuffle=True: the raw OMNIGLOT data is ordered by character class, so
    # the validation split must be drawn from a shuffled permutation.
    return BinarizedDatasetTheano(DatasetTheano(train_data, test_data, n_validation, shuffle=True))
def binarized_mnist_fixed_binarization():
    """Load the fixed-binarization MNIST splits from .amat text files."""
    def lines_to_np_array(lines):
        # Each line holds whitespace-separated 0/1 pixel values.
        return np.array([[int(tok) for tok in line.split()] for line in lines])
    def load_split(basename):
        path = os.path.join(config.DATASETS_DIR, 'BinaryMNIST', basename)
        with open(path) as f:
            return lines_to_np_array(f.readlines()).astype('float32')
    train_data = load_split('binarized_mnist_train.amat')
    validation_data = load_split('binarized_mnist_valid.amat')
    test_data = load_split('binarized_mnist_test.amat')
    # Concatenate train+valid and let DatasetTheano hold out the final
    # 10000 rows again, preserving the canonical validation split.
    return DatasetTheano(np.concatenate([train_data, validation_data], axis=0),
                         test_data, n_used_for_validation=10000, shuffle=False)
def load_dataset_from_name(dataset_name):
    """Build a dataset object by name.

    Args:
        dataset_name: one of 'MNIST', 'BinFixMNIST', 'OMNI'.

    Returns:
        A BinarizedDatasetTheano ('MNIST', 'OMNI') or DatasetTheano
        ('BinFixMNIST') instance.

    Raises:
        ValueError: if ``dataset_name`` is not recognized. (Previously an
        unknown name fell through every branch and crashed with
        UnboundLocalError on the final ``return dataset``.)
    """
    if dataset_name == 'MNIST':
        return binarized_mnist()
    if dataset_name == 'BinFixMNIST':
        return binarized_mnist_fixed_binarization()
    if dataset_name == 'OMNI':
        return binarized_shuffled_omniglot()
    raise ValueError("Unknown dataset name: %r" % (dataset_name,))