I ran an experiment on the PANDA dataset in which I extracted the middle-resolution image from each slide and cropped away the white area.
Without multiprocessing, it took 1 hour 30 minutes.
With multiprocessing, it took 30 minutes.
Here is the core code
The complete code is here
from multiprocessing import Pool
import multiprocessing
import numpy as np
import pandas as pd
import time
import skimage.io
from tqdm import tqdm
from PIL import Image
# Root of the competition data and the folder holding the training slides.
COMP_DIR = '../../../input/prostate-cancer-grade-assessment/'
TRAIN_DIR = COMP_DIR + 'train_images/'

# Metadata table; its `image_id` column lists the slides to convert.
# NOTE: this runs at import time, so any process that re-imports this module
# (e.g. pool workers under the "spawn" start method) reads the CSV again.
df = pd.read_csv(COMP_DIR + 'train.csv')
def crop_white(image: np.ndarray, value: int = 255) -> np.ndarray:
    """Trim the white border around the content of an RGB image.

    A pixel counts as content when at least one of its channels is strictly
    below ``value``. The result is the slice of ``image`` spanning the
    bounding box of all content pixels.

    Args:
        image: ``uint8`` array with three channels on axis 2.
        value: Channel level treated as white; anything below it is content.

    Returns:
        The cropped slice of ``image``, or ``image`` itself when no pixel is
        darker than ``value``.
    """
    assert image.shape[2] == 3
    assert image.dtype == np.uint8

    # Darkest channel value found in each row / each column.
    row_darkest = image.min((1, 2))
    col_darkest = image.min(0).min(1)

    (content_rows,) = np.nonzero(row_darkest < value)
    (content_cols,) = np.nonzero(col_darkest < value)

    # Nothing darker than `value` anywhere: leave the image untouched.
    if content_cols.size == 0 or content_rows.size == 0:
        return image

    # nonzero() yields indices in ascending order, so the ends are min / max.
    top, bottom = content_rows[0], content_rows[-1]
    left, right = content_cols[0], content_cols[-1]
    return image[top:bottom + 1, left:right + 1]
def crop_all_img(img_id):
    """Convert one training slide to a white-cropped JPEG.

    Reads ``TRAIN_DIR + img_id + '.tiff'``, takes the image at index 1 of the
    multi-image TIFF (presumably the middle-resolution level), removes the
    white border with ``crop_white`` and writes ``<img_id>.jpg`` (quality 90)
    relative to the current working directory.

    Args:
        img_id: Slide identifier, i.e. the TIFF file name without extension.
    """
    tiff_path = TRAIN_DIR + img_id + '.tiff'
    levels = skimage.io.MultiImage(tiff_path)
    cropped = crop_white(levels[1])
    Image.fromarray(cropped).save(img_id + '.jpg', quality=90)
if __name__ == '__main__':
    # Convert every training slide in parallel and report the elapsed time.
    # perf_counter() is monotonic, so the measurement cannot be skewed by
    # system clock adjustments during the run.
    start = time.perf_counter()
    img_ids = df.image_id.values

    # map() blocks until every slide is processed; leaving the `with` block
    # then shuts the worker processes down instead of leaking them.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        pool.map(crop_all_img, img_ids)

    delta = time.perf_counter() - start
    print(f'Used time: {delta}')