# preprocess.py
# CIFAR-10 preprocessing utilities: unpickling the dataset batches and
# extracting/normalizing a two-class subset for binary classification.
# (Web-scrape page chrome and the rendered line-number gutter removed;
# they were extraction artifacts, not part of the source file.)
import pickle
import numpy as np
import tensorflow as tf
import os
def unpickle(file):
    """
    CIFAR data contains the files data_batch_1, data_batch_2, ...,
    as well as test_batch. We have combined all train batches into one
    batch for you. Each of these files is a Python "pickled"
    object produced with cPickle. The code below will open up a
    "pickled" object (each file) and return a dictionary.
    NOTE: DO NOT EDIT
    :param file: the file to unpickle
    :return: dictionary of unpickled data
    """
    # SECURITY NOTE(review): pickle.load executes arbitrary code when fed
    # untrusted data — only use this on trusted CIFAR data files.
    with open(file, 'rb') as fo:
        # Renamed from `dict` to avoid shadowing the builtin type.
        unpickled = pickle.load(fo, encoding='bytes')
    return unpickled
def get_data(file_path, first_class, second_class):
    """
    Given a file path and two target classes, returns an array of
    normalized inputs (images) and an array of labels.
    You will want to first extract only the data that matches the
    corresponding classes we want (there are 10 classes and we only want 2).
    You should make sure to normalize all inputs and also turn the labels
    into one hot vectors using tf.one_hot().
    Note that because you are using tf.one_hot() for your labels, your
    labels will be a Tensor, while your inputs will be a NumPy array. This
    is fine because TensorFlow works with NumPy arrays.
    :param file_path: file path for inputs and labels, something
    like 'CIFAR_data_compressed/train'
    :param first_class: an integer (0-9) representing the first target
    class in the CIFAR10 dataset, for a cat, this would be a 3
    :param second_class: an integer (0-9) representing the second target
    class in the CIFAR10 dataset, for a dog, this would be a 5
    :return: normalized NumPy array of inputs and tensor of labels, where
    inputs are of type np.float32 and has size (num_inputs, width, height, num_channels) and labels
    has size (num_examples, num_classes)
    """
    unpickled_file = unpickle(file_path)
    inputs = unpickled_file[b'data']
    labels = unpickled_file[b'labels']
    # Keep only examples of the two target classes; binary label is
    # 0 for first_class and 1 for second_class.
    indices = []
    binary_labels = []
    for i, label in enumerate(labels):
        if label == first_class:
            indices.append(i)
            binary_labels.append(0)
        elif label == second_class:
            indices.append(i)
            binary_labels.append(1)
    # Select, normalize to [0, 1], and reshape in NumPy directly so the
    # returned inputs are a genuine np.float32 ndarray (the original
    # round-tripped through TF tensors and relied on np.transpose to
    # convert back).
    inputs = np.asarray(inputs)[indices].astype(np.float32) / 255.0
    # CIFAR stores each image flat in channel-first order (3, 32, 32);
    # reshape, then move channels last -> (num_examples, 32, 32, 3).
    inputs = inputs.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
    # Removed dead `labels = tf.gather(labels, filtered_l)` — its result
    # was immediately overwritten by tf.one_hot on the binary labels.
    labels = tf.one_hot(binary_labels, depth=2)
    return inputs, labels