preprocessing
Module for preprocessing
import pydicom,kornia,skimage
from fastai.vision.all import *
from fastai.medical.imaging import *
from torchvision.utils import save_image
from fmi.pipeline import *
from fmi.explore import get_dicom_image
def mask_and_save_path(file: (L), source=None, show=False, window=dicom_windows.lungs, sigma:float=0.1,
                       thresh:float=0.9, save=False, save_path=None):
    """Crop DICOM files to the bounding box of their blur mask; optionally save and show.

    Args:
        file: iterable of DICOM file paths.
        source: not used by this function.
        show: when truthy, display up to the first 10 cropped images in one row.
        window: window tuple, unpacked into `windowed` and passed to `mask_from_blur`.
        sigma: forwarded to `mask_from_blur`.
        thresh: forwarded to `mask_from_blur`.
        save: when truthy, write each crop to `<save_path>/<file name without extension>.png`.
        save_path: output directory, created if missing. Required when `save` is truthy.

    Raises:
        ValueError: if `save` is truthy but `save_path` is None.
    """
    if save:
        if save_path is None:
            raise ValueError('save_path must be provided when save is True')
        # Create the output directory once instead of re-checking for every file.
        os.makedirs(save_path, exist_ok=True)
    image_list = []
    for i in file:
        # os.path handles the running platform's separators, so the name no longer
        # depends on a hard-coded '\\' split that only worked on Windows.
        file_name = os.path.splitext(os.path.basename(str(i)))[0]
        dcm = dcmread(i)
        wind = dcm.windowed(*window)
        mask = dcm.mask_from_blur(window, sigma=sigma, thresh=thresh, remove_max=False)
        lo, hi = mask2bbox(mask)
        # Keep only the region inside the mask's bounding box.
        imh = wind[lo[0]:hi[0], lo[1]:hi[1]]
        if save:
            save_image(imh, f'{save_path}/{file_name}.png')
        image_list.append(imh)
    if show:
        show_images(image_list[:10], nrows=1)
def mask_and_save_df(file: (pd.DataFrame), source=None, show=False, window=dicom_windows.lungs, sigma:float=0.1,
                     thresh:float=0.9, save=False, save_path=None):
    """Crop the DICOM files listed in a dataframe to their blur-mask bounding box.

    Each row is resolved to `<source>/<PatientID>/<InstanceNumber>.dcm`.

    Args:
        file: dataframe with `PatientID` and `InstanceNumber` columns.
        source: root folder that contains the per-patient folders.
        show: when truthy, display up to the first 10 cropped images in one row.
        window: window tuple, unpacked into `windowed` and passed to `mask_from_blur`.
        sigma: forwarded to `mask_from_blur`.
        thresh: forwarded to `mask_from_blur`.
        save: when truthy, write each crop to `<save_path>/<InstanceNumber>.png`.
        save_path: output directory, created if missing. Required when `save` is truthy.

    Raises:
        ValueError: if `save` is truthy but `save_path` is None.
    """
    if save:
        if save_path is None:
            raise ValueError('save_path must be provided when save is True')
        os.makedirs(save_path, exist_ok=True)
    image_list = []
    # Walk the column values directly. Looking up index *labels* with positional
    # `.iloc` fails (or picks the wrong row) for any frame whose index is not
    # 0..n-1, e.g. a slice such as df[10:20] or a filtered frame.
    for patient_id, instance_number in zip(file['PatientID'], file['InstanceNumber']):
        file_path = f"{source}/{patient_id}/{instance_number}.dcm"
        dcm = dcmread(file_path)
        wind = dcm.windowed(*window)
        mask = dcm.mask_from_blur(window, sigma=sigma, thresh=thresh, remove_max=False)
        lo, hi = mask2bbox(mask)
        # Keep only the region inside the mask's bounding box.
        imh = wind[lo[0]:hi[0], lo[1]:hi[1]]
        if save:
            # NOTE: the output name is the instance number only, so rows from
            # different patients with the same number overwrite each other.
            save_image(imh, f'{save_path}/{instance_number}.png')
        image_list.append(imh)
    if show:
        show_images(image_list[:10], nrows=1)
fastai has a handy method, `from_dicoms`, that can access DICOM metadata and display it in a dataframe.
# Collect the DICOM files found under one folder of the training set (machine-specific path).
m_items = get_dicom_files('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430')
# Root folder handed to the helpers below as `source`.
source = 'D:/Datasets/osic-pulmonary-fibrosis-progression/train/'
# Build a dataframe from the collected files and preview its first two rows.
dicom_dataframe = pd.DataFrame.from_dicoms(m_items)
dicom_dataframe[:2]
To see how `mask_and_save_df` works, here are 10 original images (we can use `get_dicom_image` to view the images). To save the images in png format, change `save` to `True` and set a `save_path`.
# View the images for the first 10 dataframe rows in a single row, before any masking.
get_dicom_image(dicom_dataframe[:10], 'PatientID', source=source, nrows=1)
Using a sigma of 0.2
and a lung window
# Crop and display the same 10 rows with three blur sigmas (largest first) so the
# effect of `sigma` on the cropped area can be compared; nothing is saved.
for blur_sigma in (0.2, 0.1, 0.01):
    mask_and_save_df(dicom_dataframe[:10], window=dicom_windows.lungs, show=True, source=source,
                     sigma=blur_sigma, save=False)
Setting the sigma value to 0.1 crops the image down to only the areas that are important.
The `from_dicoms` method is updated to `from_dicoms2`, which allows you to choose the window setting.
@patch
def updated_dict(self:DcmDataset, windows=dicom_windows.lungs):
    """Return the dataset's non-pixel elements plus pixel statistics as a dict.

    Args:
        windows: a single window tuple (e.g. `dicom_windows.lungs`); it is
            unpacked into `pct_in_window`. The default used to be a list that
            wrapped the tuple, which unpacked to one argument instead of the
            window's values and was also a shared mutable default.

    Returns:
        dict mapping each element keyword to its value, plus `fname`,
        `img_min`, `img_max`, `img_mean`, `img_std`, `img_pct_window` and
        `file_path` (`<PatientID>/<InstanceNumber>.dcm`).
    """
    # Tag left out of the metadata; the pixels are summarised separately below.
    pxdata = (0x7fe0,0x0010)
    res = {elem.keyword: elem.value for elem in (self[tag] for tag in self.keys() if tag != pxdata)}
    res['fname'] = self.filename
    pxs = self.pixel_array
    for f in ('min', 'max', 'mean', 'std'):
        res['img_'+f] = getattr(pxs, f)()
    res['img_pct_window'] = self.pct_in_window(*windows)
    res['file_path'] = f'{self.PatientID}/{self.InstanceNumber}.dcm'
    return res
def _dcm2dict2(fn, windows, **kwargs):
    "Read the DICOM file `fn` and return its `updated_dict` computed for `windows`"
    dataset = fn.dcmread()
    return dataset.updated_dict(windows, **kwargs)
@delegates(parallel)
def _from_dicoms2(cls, fns, n_workers=0, **kwargs):
    "Build a dataframe with one row per file in `fns`, each row produced by `_dcm2dict2`"
    rows = parallel(_dcm2dict2, fns, n_workers=n_workers, **kwargs)
    return pd.DataFrame(rows)
# Attach the builder to `pd.DataFrame` so it is callable as `pd.DataFrame.from_dicoms2(...)`.
pd.DataFrame.from_dicoms2 = classmethod(_from_dicoms2)
`from_dicoms2` allows you to set the DICOM window; in this case the mediastinum window is used.
# Rebuild the dataframe so `img_pct_window` is computed against the mediastinum
# window, then preview the first two rows.
dicom_dataframe = pd.DataFrame.from_dicoms2(m_items, windows=dicom_windows.mediastinum)
dicom_dataframe[:2]
In this case the lungs
window was used
# Rebuild the dataframe once more, this time with `img_pct_window` computed
# against the lungs window, then preview the first two rows.
dicom_dataframe = pd.DataFrame.from_dicoms2(m_items, windows=dicom_windows.lungs)
dicom_dataframe[:2]
def move_files(df, source, save_path):
    """Copy each row's DICOM file into a per-patient folder under `save_path`.

    Despite the name, files are copied with `shutil.copy`, not moved.

    Args:
        df: dataframe with `PatientID`, `img_pct_window` and `fname` columns.
        source: root folder; files are read from `<source>/train/<PatientID>/<name>.dcm`.
        save_path: destination root (str or Path); `<save_path>/<PatientID>` is
            created when missing.
    """
    for i in df.index:
        patid = str(df.PatientID[i])
        window = str(df.img_pct_window[i])
        # Path.stem copes with the platform's separators, unlike a manual '/' split.
        img = Path(str(df.fname[i])).stem
        print(f'ID: {patid} window: {window} instance: {img}')
        # Join rather than concatenate, so a `save_path` without a trailing
        # separator (or given as a Path) still yields `<save_path>/<patid>`.
        folder_path = Path(save_path) / patid
        folder_path.mkdir(parents=True, exist_ok=True)
        img_file = Path(f'{source}/train/{patid}/{img}.dcm')
        shutil.copy(img_file, folder_path, follow_symlinks=True)
def dicom_convert_3channel(fn:(Path,str), save_dir:(str), win1=dicom_windows.lungs,
                           win2=dicom_windows.liver, win3=dicom_windows.brain):
    """Render one DICOM file under three windows, one per channel, and save it as jpg.

    Args:
        fn: path of the DICOM file to read.
        save_dir: output folder, created if missing; the image is written to
            `<save_dir>/<input file name without extension>.jpg`.
        win1, win2, win3: window tuples used for the first, second and third channel.
    """
    data = dcmread(fn)
    # Path.stem works with the running platform's separators; the previous
    # hard-coded '\\' split only isolated the file name on Windows.
    name = Path(fn).stem
    # Give each windowed image a trailing channel axis, then stack along it.
    channels = [np.expand_dims(data.windowed(*win), axis=2) for win in (win1, win2, win3)]
    image = np.concatenate(channels, axis=2)
    # Move the channel axis first before handing the tensor to `save_image`.
    ten_image = TensorImage(image).permute(2,0,1)
    Path(save_dir).mkdir(parents=True, exist_ok=True)
    save_image(ten_image, f'{save_dir}/{name}.jpg')
To see how `dicom_convert_3channel` works, specify a save directory and choose a test file.
# Output folder for the 3-channel demo (machine-specific path).
save_dir = 'D:/Datasets/osic-pulmonary-fibrosis-progression/test3c/'
# Use the file at position 12 of the collected files as the test input and display it.
test1 = m_items[12]
test1
Choose 3 windows, one for each channel: in this case `lungs`, `mediastinum` and `pe`.
# Convert the test file using the lungs, mediastinum and pe windows for channels 1-3.
dicom_convert_3channel(test1, save_dir, win1=dicom_windows.lungs, win2=dicom_windows.mediastinum, win3=dicom_windows.pe)
Load the saved image, which is saved under the same name as the input image.
# Load the saved 3-channel image back as a tensor and check its shape.
saved_image = PILImage.create('D:/Datasets/osic-pulmonary-fibrosis-progression/test3c/20.jpg')
saved_ten = TensorImage(saved_image)
saved_ten.shape
# Show the combined image followed by each channel on its own; the titles name
# the windows used for the conversion (the 'mediastinum' title was misspelled).
show_images([saved_ten[:,:,:], saved_ten[:,:,0], saved_ten[:,:,1], saved_ten[:,:,2]],
            titles=['all_channels', 'lungs', 'mediastinum', 'pe'])
def dicomsplit(valid_pct=0.2, seed=None, **kwargs):
    """Create a splitter that randomly divides DICOM items into train/valid indices
    and checks whether any patient ID appears in both sets.

    Args:
        valid_pct: fraction of the items assigned to the validation set.
        seed: when not None, passed to `torch.manual_seed` before shuffling.
        **kwargs: accepted but not used.

    Returns:
        A function `_inner(o)` that prints the split, shows up to 20 images of
        each set and returns `(train_idx, valid_idx)`. When a patient ID occurs
        in both sets, `train_idx` holds only the training indices whose patient
        is absent from the validation set; otherwise it is the full training
        split. (The duplicate branch used to return patient IDs instead of
        indices, which did not match the other branch.)
    """
    def _inner(o, **kwargs):
        if seed is not None: torch.manual_seed(seed)
        rand_idx = L(int(i) for i in torch.randperm(len(o)))
        cut = int(valid_pct * len(o))
        trn = rand_idx[cut:]
        val = rand_idx[:cut]

        def _load(idxs):
            "Read each selected file once; return its patient IDs and pixel arrays"
            patients, images = [], []
            for path in o[idxs]:
                dcm = dcmread(path)
                patients.append(dcm.PatientID)
                images.append(dcm.pixel_array)
            return patients, images

        train_patient, train_images = _load(trn)
        val_patient, val_images = _load(val)
        print(rand_idx)
        print(f'Train: {trn}, {train_patient}')
        show_images(train_images[:20])
        print(f'Val: {val}, {val_patient}')
        show_images(val_images[:20])
        is_duplicate = set(train_patient) & set(val_patient)
        print(f'Duplicate: {is_duplicate}')
        if is_duplicate:
            print('duplicate exists')
            # Keep the training *indices* whose patient does not also occur in
            # the validation set, so both branches return indices.
            new_list = L(idx for idx, pat in zip(trn, train_patient) if pat not in is_duplicate)
            print(f'New List: {new_list}')
        else:
            print('duplicate does NOT exist')
            new_list = trn
        return new_list, val
    return _inner
def check_duplicate(items, seed=5):
    "Run `dicomsplit` on `items` with a 20% validation share and return its (train, valid) pair"
    splitter = dicomsplit(valid_pct=0.2, seed=seed)
    trn, val = splitter(items)
    return trn, val
def dicom_splitter(items, valid_pct=0.2, seed=77):
    """Create a splitter whose validation indices are fixed by running `dicomsplit` on `items`.

    Args:
        items: DICOM items handed to `dicomsplit` to choose the validation indices.
        valid_pct: fraction of `items` assigned to the validation set.
        seed: forwarded to `dicomsplit` so the chosen split is reproducible.

    Returns:
        A function `_inner(o)` returning `(train_idx, valid_idx)` as `L`s, where
        `train_idx` is every index of `o` that is not a validation index.
    """
    # `seed` was accepted but never passed on, so the split ignored it.
    _, valid_idx = dicomsplit(valid_pct=valid_pct, seed=seed)(items)
    def _inner(o):
        train_idx = np.setdiff1d(np.array(range_of(o)), np.array(valid_idx))
        print(f'train:{train_idx} val:{valid_idx}')
        return L(train_idx, use_list=True), L(valid_idx, use_list=True)
    return _inner
Check to see how `dicom_splitter` works. First create a function that picks 10 random indices between 0 and the number of items and returns the corresponding items.
def random_(items, value=10):
    """Return `value` randomly chosen elements of `items`, sampled with replacement.

    Args:
        items: collection that supports `len` and indexing with a list of ints.
        value: number of elements to draw.

    Raises:
        ValueError: if `items` is empty and `value` is greater than 0.
    """
    # randrange excludes its upper bound. randint(0, len(items)) could return
    # len(items) itself, which is one past the last valid index.
    picks = [random.randrange(len(items)) for _ in range(value)]
    return items[picks]
# Gather the DICOM files under `source`, draw a random sample with `random_`
# (10 items by default) and display the sample.
items = get_dicom_files(source)
rand_items = random_(items)
rand_items
`check_duplicate` shows the indices of the 10 random images chosen above. By default the train/valid split is 80/20, so the train set has 8 images and the valid set has 2 images. We can view the 8 images in the train set and the 2 images in the valid set.
# Run the duplicate check on the random sample with seed 70 (set globally and
# passed to `check_duplicate`).
set_seed(70)
check_duplicate(rand_items, seed=70)
In the example above there are no duplicates so the train/valid sets remain the same
# Repeat the duplicate check on the same sample with a different seed (7).
set_seed(7)
check_duplicate(rand_items, seed=7)