This example uses the Kaggle Cats & Dogs dataset and converts it to the TFRecord file format.
import tensorflow as tf
from random import shuffle
import glob
import cv2
import sys
import os
import numpy as np
# 1 List images and their labels
shuffle_data = True
addrs = glob.glob('../../Model/Dog vs Cat/train/*.jpg')
labels = [0 if 'cat' in addr else 1 for addr in addrs] # 0 = Cat, 1 = Dog
# shuffle the data
if shuffle_data:
    c = list(zip(addrs, labels))  # pair each address with its label
    shuffle(c)
    addrs, labels = zip(*c)
# Divide the data into 60% train, 20% validation, and 20% test
train_rate = 0.6
eval_rate = 0.2
test_rate = 0.2
train_addrs = addrs[0: int(train_rate * len(addrs))]
train_labels = labels[0: int(train_rate * len(labels))]
eval_addrs = addrs[int(train_rate * len(addrs)) : int((train_rate + eval_rate) * len(addrs))]
eval_labels = labels[int(train_rate * len(labels)) : int((train_rate + eval_rate) * len(labels))]
test_addrs = addrs[int((train_rate + eval_rate) * len(addrs)) :]
test_labels = labels[int((train_rate + eval_rate) * len(labels)) :]
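# optional sanity check (not in the original script): the three slices above
# partition the shuffled list exactly, roughly 60/20/20
assert len(train_addrs) + len(eval_addrs) + len(test_addrs) == len(addrs)
print('Split sizes - train: {}, eval: {}, test: {}'.format(
    len(train_addrs), len(eval_addrs), len(test_addrs)))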
def load_image(addr):
    # read an image and resize it to (224, 224)
    # cv2 loads images as BGR; convert to RGB
    img = cv2.imread(addr)
    if img is None:  # guard against unreadable files
        return None
    img = cv2.resize(img, (224, 224), interpolation=cv2.INTER_CUBIC)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(np.float32)
    return img
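# quick illustrative check (not in the original script): a loaded image should be
# a float32 array of shape (224, 224, 3) after the resize and BGR->RGB conversion
sample = load_image(train_addrs[0])
if sample is not None:
    print('sample image:', sample.shape, sample.dtype)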
# helpers to wrap values as tf.train.Feature
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _byte_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
out_dir = './tmp/'
if not os.path.exists(out_dir):  # make sure the output directory exists
    os.makedirs(out_dir)
train_filename = 'train.tfrecords'
# open the TFRecords file
writer = tf.python_io.TFRecordWriter(out_dir + train_filename)
for i in range(len(train_addrs)):
    if not i % 1000:  # print progress every 1000 images
        print('Train data: {}/{}'.format(i, len(train_addrs)))
        sys.stdout.flush()
    img = load_image(train_addrs[i])
    if img is None:  # skip files that could not be read
        continue
    label = train_labels[i]
    # Create a feature
    feature = {'train/label': _int64_feature(label),
               'train/image': _byte_feature(tf.compat.as_bytes(img.tobytes()))}
    # Create an example protocol buffer
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    # Serialize to string and write it to the file
    writer.write(example.SerializeToString())
writer.close()
sys.stdout.flush()
eval_filename = 'eval.tfrecords'
writer = tf.python_io.TFRecordWriter(out_dir + eval_filename)
for i in range(len(eval_addrs)):
    if not i % 1000:  # print progress every 1000 images
        print('Eval data: {}/{}'.format(i, len(eval_addrs)))
        sys.stdout.flush()
    img = load_image(eval_addrs[i])
    if img is None:  # skip files that could not be read
        continue
    label = eval_labels[i]
    feature = {'eval/label': _int64_feature(label),
               'eval/image': _byte_feature(tf.compat.as_bytes(img.tobytes()))}
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())
writer.close()
sys.stdout.flush()
test_filename = 'test.tfrecords'
writer = tf.python_io.TFRecordWriter(out_dir + test_filename)
for i in range(len(test_addrs)):
    if not i % 1000:  # print progress every 1000 images
        print('Test data: {}/{}'.format(i, len(test_addrs)))
        sys.stdout.flush()
    img = load_image(test_addrs[i])
    if img is None:  # skip files that could not be read
        continue
    label = test_labels[i]
    feature = {'test/label': _int64_feature(label),
               'test/image': _byte_feature(tf.compat.as_bytes(img.tobytes()))}
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())
writer.close()
sys.stdout.flush()
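To verify the output, the records can be read back and decoded. The sketch below is illustrative only, assuming the same TensorFlow 1.x API used above and that each image was stored as raw float32 bytes of shape (224, 224, 3); parse_train_record is a hypothetical helper name, not part of the original script.
def parse_train_record(serialized):
    # keys must match the ones written above
    features = {'train/image': tf.FixedLenFeature([], tf.string),
                'train/label': tf.FixedLenFeature([], tf.int64)}
    parsed = tf.parse_single_example(serialized, features)
    image = tf.decode_raw(parsed['train/image'], tf.float32)
    image = tf.reshape(image, [224, 224, 3])
    label = tf.cast(parsed['train/label'], tf.int32)
    return image, label
dataset = tf.data.TFRecordDataset('./tmp/train.tfrecords').map(parse_train_record)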