Custom masks for sampling specific regions from images with ZarrDataset

import zarrdataset as zds
import zarr

# These are images from the Image Data Resource (IDR) 
# https://idr.openmicroscopy.org/ that are publicly available and were 
# converted to the OME-NGFF (Zarr) format by the OME group. More examples
# can be found at Public OME-Zarr data (Nov. 2020)
# https://www.openmicroscopy.org/2020/11/04/zarr-data.html

filenames = ["https://uk1s3.embassy.ebi.ac.uk/idr/zarr/v0.4/idr0073A/9798462.zarr"]

import random
import numpy as np

# For reproducibility
np.random.seed(478963)
random.seed(478965)

z_img = zarr.open(filenames[0], mode="r")
z_img["0"].info

Name	/0
Type	zarr.core.Array
Data type	uint8
Shape	(1, 3, 1, 16433, 21115)
Chunk shape	(1, 1, 1, 1024, 1024)
Order	C
Read-only	True
Compressor	Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type	zarr.storage.FSStore
No. bytes	1040948385 (992.7M)
Chunks initialized	0/1071

import numpy as np
import matplotlib.pyplot as plt
plt.imshow(np.moveaxis(z_img["4"][0, :, 0], 0, -1))
plt.show()

../_images/efb0e0c981a2a37c61e11c54650023abd22cd8217f80b9af551ca2c50a0ee414.png

Define a mask from where patches can be extracted

from skimage import color, filters, morphology

im_gray = color.rgb2gray(z_img["4"][0, :, 0], channel_axis=0)
thresh = filters.threshold_otsu(im_gray)

mask = im_gray > thresh
mask = morphology.remove_small_objects(mask == 0, min_size=16 ** 2,
                                       connectivity=2)
mask = morphology.remove_small_holes(mask, area_threshold=128)
mask = morphology.binary_erosion(mask, morphology.disk(8))
mask = morphology.binary_dilation(mask, morphology.disk(8))

plt.imshow(mask)
plt.show()

../_images/845f208486df8f8430ae79295cf9999af568961227d3a7f74143891b3feaf295.png

plt.imshow(np.moveaxis(z_img["4"][0, :, 0], 0, -1))
plt.imshow(mask, cmap="gray", alpha=1.0*(mask < 1))
plt.show()

../_images/a29b08481e2598a61febed5711e2d9285586fdc5a6112febb2b8b4f32587e173.png

Extract patches of size 512x512 pixels from a Whole Slide Image (WSI)

Sample the image uniformly in a squared grid pattern

patch_size = dict(Y=512, X=512)
patch_sampler = zds.PatchSampler(patch_size=patch_size)

Use the ZarrDataset class to enable extraction of samples from masked regions.

An extra dimension is added to the mask, so it matches the number of spatial axes in the image

image_specs = zds.ImagesDatasetSpecs(
  filenames=filenames,
  data_group="1",
  source_axes="TCZYX",
)

# Use the MasksDatasetSpecs to add the specifications of the masks.
#masks_specs = zds.MasksDatasetSpecs(
masks_specs = zds.LabelsDatasetSpecs(
  filenames=[mask],
  source_axes="YX",
  axes="ZYX",
  modality="masks",
)

my_dataset = zds.ZarrDataset([image_specs, masks_specs],
                             patch_sampler=patch_sampler)

ds_iterator = iter(my_dataset)

sample = next(ds_iterator)
type(sample[0]), sample[0].shape, sample[0].dtype
type(sample[1]), sample[1].shape, sample[1].dtype

(numpy.ndarray, (1, 64, 65), dtype('bool'))

plt.imshow(np.moveaxis(sample[0][0, :, 0], 0, -1))
plt.show()

../_images/0493ea7949336c8bafa6c2a4a4fb0b6f684533792c8a8aee276b84450c0c67ea.png

plt.imshow(sample[1][0])
plt.show()

../_images/f99c6c24479005965377053093c7c4b95248d8010913dabe394641feeb087266.png

samples = []
labels = []
for i, sample in enumerate(my_dataset):
    samples.append(np.moveaxis(sample[0][0, :, 0], 0, -1))
    labels.append(sample[1][0])

    if i >= 4:
        # Take only five samples for illustration purposes
        break

samples = np.hstack(samples)
labels = np.hstack(labels)

plt.imshow(samples)
plt.show()

../_images/ee452c4ccae35fc77d15bea56dce7c6a644de67d7e546c281e6b1cc08f9c7b2e.png

plt.imshow(labels)
plt.show()

../_images/35caf5d6a16a402da32ca7028c903bf35d4a356a2cc6d94421b48e1413335fa9.png

Use a function to generate the masks for each image in the dataset

Get only patches that are covered by at least 1/16th of their area by the mask

patch_size = dict(Y=512, X=512)
patch_sampler = zds.PatchSampler(patch_size=patch_size, min_area=1/16)

Apply WSITissueMaskGenerator transform to each image in the dataset to define each sampling mask

mask_func = zds.WSITissueMaskGenerator(mask_scale=1,
                                       min_size=16,
                                       area_threshold=128,
                                       axes="ZYX")

Because the input image (zarr group “1”) is large, computing the mask directly on that could require high computational resources.

For that reason, use a donwsampled version of that image instead by pointing mask_data_group="4" to use a 1:16 downsampled version of the input image.

The mask_axes should match the ones that WSITissueMaskGenerator requies as input (“YXC”). To do that, a ROI can be specified to take just the spatial and channel axes from the input image with mask_roi="(0,0,0,0,0):(1,-1,1,-1,-1)", and rearrange the output axes with mask_axes="YXC".

image_specs = zds.ImagesDatasetSpecs(
  filenames=filenames,
  data_group="1",
  source_axes="TCZYX",
)

# Use the MasksDatasetSpecs to add the specifications of the masks.
# The mask generation function is added as `image_loader_func` parameter of the dataset specification for masks.
masks_specs = zds.MasksDatasetSpecs(
  filenames=filenames,
  data_group="4",
  source_axes="TCZYX",
  axes="YXC",
  roi="(0,0,0,0,0):(1,-1,1,-1,-1)",
  image_loader_func=mask_func,
)

my_dataset = zds.ZarrDataset([image_specs, masks_specs],
                             patch_sampler=patch_sampler,
                             shuffle=True)

samples = []

for i, sample in enumerate(my_dataset):
    samples.append(np.moveaxis(sample[0, :, 0], 0, -1))

    if i >= 4:
        # Take only five samples for illustration purposes
        break

samples = np.hstack(samples)

plt.imshow(samples)
plt.show()

../_images/4e0e23bdb35ff463a282b0662c51ebe482fdeb0fb21e017109fefe9fe682fff5.png