Home > AI > Uncategorized

TensorFlow – TfRecord1

This example uses the Kaggle Cats & Dogs dataset and converts it to the TFRecord file format.


import glob
import os
import sys
from random import shuffle

import cv2
import numpy as np
import tensorflow as tf

# 1 List images and their labels
shuffle_data = True

addrs = glob.glob('../../Model/Dog vs Cat/train/*.jpg')
if not addrs:
    # Fail early with a clear message instead of an opaque unpacking error
    # from zip(*c) further down (or silently writing empty record files).
    raise ValueError('No .jpg images found in ../../Model/Dog vs Cat/train/')

# 0 = Cat, 1 = Dog. Only the file name is inspected, so a directory
# component that happens to contain "cat" cannot mislabel every image.
labels = [0 if 'cat' in os.path.basename(addr) else 1 for addr in addrs]

# Shuffle paths and labels together so every path keeps its own label.
if shuffle_data:
    c = list(zip(addrs, labels))  # list of (path, label) tuples
    shuffle(c)
    addrs, labels = zip(*c)

# Divide the data into 60% train, 20% validation, and 20% test
train_rate = 0.6
eval_rate = 0.2
test_rate = 0.2  # not used below: the test split takes whatever remains

# addrs and labels have the same length, so one pair of cut points
# serves both sequences.
train_end = int(train_rate * len(addrs))
eval_end = int((train_rate + eval_rate) * len(addrs))

train_addrs = addrs[:train_end]
train_labels = labels[:train_end]

eval_addrs = addrs[train_end:eval_end]
eval_labels = labels[train_end:eval_end]

test_addrs = addrs[eval_end:]
test_labels = labels[eval_end:]

def load_image(addr):
    """Read an image file and return it as a 224x224 RGB float32 array.

    Args:
        addr: Path of the image file to read.

    Returns:
        The image resized to 224x224 with bicubic interpolation, converted
        from OpenCV's BGR channel order to RGB, and cast to ``np.float32``.

    Raises:
        IOError: If OpenCV cannot read or decode the file at ``addr``.
    """
    img = cv2.imread(addr)
    if img is None:
        # cv2.imread signals failure by returning None rather than raising;
        # without this check the failure surfaces as an unrelated resize error.
        raise IOError('Could not read image: {}'.format(addr))

    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
    # cv2 loads images as BGR, convert to RGB
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img.astype(np.float32)

# convert data
def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` backed by an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _byte_feature(value):
    """Wrap a single bytes value in a ``tf.train.Feature`` backed by a BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


# Output directory. The name `dir` shadows the builtin, but it is kept
# because the later eval/test sections of this script reuse it.
dir='./tmp/'
train_filename = 'train.tfrecords'

# Write each training image and its label as one tf.train.Example.
# The `with` block closes the TFRecords file even if an image fails to load.
with tf.python_io.TFRecordWriter(dir+train_filename) as writer:
    for i, (addr, label) in enumerate(zip(train_addrs, train_labels)):
        if i % 1000 == 0:  # progress report every 1000 images
            print('Train data:{}/{}'.format(i, len(train_addrs)))
            sys.stdout.flush()  # show progress now instead of when the loop ends

        img = load_image(addr)

        # Create a feature; tobytes() is the non-deprecated spelling of
        # tostring() and already returns bytes.
        feature = {
            'train/label': _int64_feature(label),
            'train/image': _byte_feature(img.tobytes()),
        }

        # Create an example protocol buffer
        example = tf.train.Example(features=tf.train.Features(feature=feature))

        # Serialize to string and write on the file
        writer.write(example.SerializeToString())

sys.stdout.flush()




eval_filename = 'eval.tfrecords'

# Write each validation image and its label as one tf.train.Example.
# The `with` block closes the TFRecords file even if an image fails to load.
with tf.python_io.TFRecordWriter(dir+eval_filename) as writer:
    for i, (addr, label) in enumerate(zip(eval_addrs, eval_labels)):
        if i % 1000 == 0:  # progress report every 1000 images
            print('Eval data:{}/{}'.format(i, len(eval_addrs)))
            sys.stdout.flush()

        img = load_image(addr)

        # tobytes() is the non-deprecated spelling of tostring() and
        # already returns bytes.
        feature = {
            'eval/label': _int64_feature(value=label),
            'eval/image': _byte_feature(value=img.tobytes()),
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())

sys.stdout.flush()




test_filename = 'test.tfrecords'

# Write each test image and its label as one tf.train.Example.
# The `with` block closes the TFRecords file even if an image fails to load.
with tf.python_io.TFRecordWriter(dir+test_filename) as writer:
    for i, (addr, label) in enumerate(zip(test_addrs, test_labels)):
        if i % 1000 == 0:  # progress report every 1000 images
            print('Test data:{}/{}'.format(i, len(test_addrs)))
            sys.stdout.flush()

        # Everything below must run for every image, not only when the
        # progress line is printed; otherwise just one image per thousand
        # ends up in the file.
        img = load_image(addr)

        # tobytes() is the non-deprecated spelling of tostring() and
        # already returns bytes.
        feature = {
            'test/label': _int64_feature(value=label),
            'test/image': _byte_feature(value=img.tobytes()),
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())

sys.stdout.flush()

 

Related posts:

Leave a Reply