MNIST/preprocess.py at main · cli1903/MNIST · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import gzip
import numpy as np

def get_data(inputs_file_path, labels_file_path, num_examples: int):
    """
    Load gzipped MNIST images and labels and return them as NumPy arrays.

    Both files are decompressed on the fly. The fixed-size header at the
    start of each file is skipped (16 bytes for images, 8 bytes for labels),
    then exactly ``num_examples`` records are read. Pixel values are scaled
    from the byte range [0, 255] to [0.0, 1.0].

    This single function is meant to serve BOTH the train and the test
    split; call it once per split from the model's main function.

    :param inputs_file_path: file path for inputs, something like
    'MNIST_data/t10k-images-idx3-ubyte.gz'
    :param labels_file_path: file path for labels, something like
    'MNIST_data/t10k-labels-idx1-ubyte.gz'
    :param num_examples: number of examples to read from the start of each
    file; each image occupies 28 * 28 bytes and each label one byte.
    :return: tuple ``(inputs, labels)`` where ``inputs`` is a float32 array
    of shape ``(num_examples, 784)`` and ``labels`` is an int8 array of
    shape ``(num_examples,)``
    :raises ValueError: if ``num_examples`` is negative or either file holds
    fewer than ``num_examples`` records.
    """
    image_header_size = 16  # bytes skipped before the first pixel
    label_header_size = 8   # bytes skipped before the first label
    image_size = 28 * 28    # one byte per pixel, images flattened row-wise

    if num_examples < 0:
        raise ValueError(
            "num_examples must be non-negative, got %d" % num_examples
        )

    with gzip.open(inputs_file_path, 'rb') as image_stream, \
            gzip.open(labels_file_path, 'rb') as label_stream:
        image_stream.read(image_header_size)
        image_bytes = image_stream.read(image_size * num_examples)

        label_stream.read(label_header_size)
        label_bytes = label_stream.read(num_examples)

    # A short read means the file is truncated or num_examples is too large.
    # Fail here with a clear message rather than deep inside reshape, and
    # never return inputs and labels of different lengths.
    if len(image_bytes) != image_size * num_examples:
        raise ValueError(
            "expected %d image bytes for %d examples but read %d from %r"
            % (image_size * num_examples, num_examples,
               len(image_bytes), inputs_file_path)
        )
    if len(label_bytes) != num_examples:
        raise ValueError(
            "expected %d label bytes but read %d from %r"
            % (num_examples, len(label_bytes), labels_file_path)
        )

    # Convert to float32 BEFORE dividing: uint8 / 255.0 would promote to
    # float64, contradicting the documented return dtype.
    inputs = np.frombuffer(image_bytes, dtype=np.uint8).astype(np.float32)
    inputs = (inputs / 255.0).reshape(num_examples, image_size)

    # Labels are digits 0-9, so the conversion to int8 is lossless.
    labels = np.frombuffer(label_bytes, dtype=np.uint8).astype(np.int8)

    return inputs, labels