import pydicom, kornia, skimage
from fastai.vision.all import *
from fastai.medical.imaging import *
from torchvision.utils import save_image

from fmi.pipeline import *
from fmi.explore import get_dicom_image

Mask & Save

Note: View Mask & Save tutorial on Kaggle (click on Kaggle icon)
def mask_and_save_path(file:L, source=None, show=False, window=dicom_windows.lungs, sigma:float=0.1,
                       thresh:float=0.9, save=False, save_path=None):
    "Helper to create masks based on dicom window with the option to save the updated image from path"
    image_list = []
    for i in file:
        # This line will have to be changed depending on what platform is being used
        str_file = str(i); file_name = str_file.split('.')[0].split('\\')[-1]   # windows
        #str_file = str(i); file_name = str_file.split('/')[-1].split('.')[0]   # kaggle
        dcm = dcmread(i)
        wind = dcm.windowed(*window)
        mask = dcm.mask_from_blur(window, sigma=sigma, thresh=thresh, remove_max=False)
        # crop the windowed image to the bounding box of the mask
        bbs = mask2bbox(mask)
        lo,hi = bbs
        imh = wind[lo[0]:hi[0], lo[1]:hi[1]]
        if save:
            if not os.path.exists(save_path): os.makedirs(save_path)
            save_image(imh, f'{save_path}/{file_name}.png')
        image_list.append(imh)
    if show: show_images(image_list[:10], nrows=1)

mask_and_save_path[source]

mask_and_save_path(file:L, source=None, show=False, window=(1500, -600), sigma:float=0.1, thresh:float=0.9, save=False, save_path=None)

Helper to create masks based on dicom window with the option to save the updated image from path
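
To try mask_and_save_path directly on a list of dicom paths, pass it the output of get_dicom_files (a quick sketch, not part of the original notebook, reusing the OSIC dataset path used later on this page):

test_items = get_dicom_files('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430')
# display the first 10 cropped, lung-windowed images without saving them
mask_and_save_path(test_items[:10], show=True, window=dicom_windows.lungs, sigma=0.1, save=False)
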
def mask_and_save_df(file:pd.DataFrame, source=None, show=False, window=dicom_windows.lungs, sigma:float=0.1,
                     thresh:float=0.9, save=False, save_path=None):
    "Helper to create masks based on dicom window with the option to save the updated image from a dataframe"
    image_list = []
    for i in file.index:
        # build the file location from the PatientID and InstanceNumber columns
        file_path = f"{source}/{file.iloc[i]['PatientID']}/{file.iloc[i]['InstanceNumber']}.dcm"
        file_name = file.iloc[i]['InstanceNumber']
        dcm = dcmread(file_path)
        wind = dcm.windowed(*window)
        mask = dcm.mask_from_blur(window, sigma=sigma, thresh=thresh, remove_max=False)
        # crop the windowed image to the bounding box of the mask
        bbs = mask2bbox(mask)
        lo,hi = bbs
        imh = wind[lo[0]:hi[0], lo[1]:hi[1]]
        if save:
            if not os.path.exists(save_path): os.makedirs(save_path)
            save_image(imh, f'{save_path}/{file_name}.png')
        image_list.append(imh)
    if show: show_images(image_list[:10], nrows=1)

mask_and_save_df[source]

mask_and_save_df(file:DataFrame, source=None, show=False, window=(1500, -600), sigma:float=0.1, thresh:float=0.9, save=False, save_path=None)

Helper to create masks based on dicom window with the option to save the updated image from a dataframe

fastai has a handy method, from_dicoms, that can read dicom metadata and display it in a dataframe.

m_items = get_dicom_files('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430')
source = 'D:/Datasets/osic-pulmonary-fibrosis-progression/train/'
dicom_dataframe = pd.DataFrame.from_dicoms(m_items)
dicom_dataframe[:2]
SpecificCharacterSet ImageType SOPInstanceUID Modality Manufacturer ManufacturerModelName PatientName PatientID PatientSex DeidentificationMethod ... ImageOrientationPatient3 ImageOrientationPatient4 ImageOrientationPatient5 MultiPixelSpacing PixelSpacing1 img_min img_max img_mean img_std img_pct_window
0 ISO_IR 100 ORIGINAL 2.25.12297650151329871895440507938349160734 CT GE MEDICAL SYSTEMS LightSpeed VCT (I, D, 0, 0, 0, 0, 7, 6, 3, 7, 2, 0, 2, 1, 7, 7, 4, 1, 1, 9, 5, 6, 4, 3, 0) ID00007637202177411956430 Table; ... 0.0 1.0 0.0 1 0.652344 -2000 2842 -1.454884 1137.488858 0.058094
1 ISO_IR 100 ORIGINAL 2.25.37611372879908126511187998276199853341 CT GE MEDICAL SYSTEMS LightSpeed VCT (I, D, 0, 0, 0, 0, 7, 6, 3, 7, 2, 0, 2, 1, 7, 7, 4, 1, 1, 9, 5, 6, 4, 3, 0) ID00007637202177411956430 Table; ... 0.0 1.0 0.0 1 0.652344 -2000 2918 19.038597 1138.876560 0.068130

2 rows × 67 columns

To see how mask_and_save_df works, here are 10 original images (we can use get_dicom_image to view them). To save the images in png format, change save to True and set a save_path (a sketch of this is shown after the sigma comparison below).

get_dicom_image(dicom_dataframe[:10], 'PatientID', source=source, nrows=1)

Using a lung window, compare sigma values of 0.2, 0.1 and 0.01:

mask_and_save_df(dicom_dataframe[:10], window=dicom_windows.lungs, show=True, source=source, sigma=0.2, save=False)
mask_and_save_df(dicom_dataframe[:10], window=dicom_windows.lungs, show=True, source=source, sigma=0.1, save=False)
mask_and_save_df(dicom_dataframe[:10], window=dicom_windows.lungs, show=True, source=source, sigma=0.01, save=False)

Setting the sigma value to 0.1 crops the image down to just the areas that are important.
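
As mentioned above, setting save=True writes the cropped images to disk as png files. A minimal sketch (the save_path below is a hypothetical output folder):

masked_path = 'D:/Datasets/osic-pulmonary-fibrosis-progression/masked/'  # hypothetical output folder
mask_and_save_df(dicom_dataframe[:10], window=dicom_windows.lungs, show=False, source=source,
                 sigma=0.1, save=True, save_path=masked_path)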

Dicom Metadata Dict

The from_dicoms method is updated here to from_dicoms2, which allows you to choose the window setting.

@patch
def updated_dict(self:DcmDataset, windows=dicom_windows.lungs):
    # collect every tag except the pixel data itself
    pxdata = (0x7fe0,0x0010)
    vals = [self[o] for o in self.keys() if o != pxdata]
    its = [(v.keyword, v.value) for v in vals]
    res = dict(its)
    res['fname'] = self.filename

    # add basic pixel statistics and the percentage of pixels inside the chosen window
    stats = 'min', 'max', 'mean', 'std'
    pxs = self.pixel_array
    for f in stats: res['img_'+f] = getattr(pxs, f)()
    res['img_pct_window'] = self.pct_in_window(*windows)
    res['file_path'] = f'{self.PatientID}/{self.InstanceNumber}.dcm'
    return res

def _dcm2dict2(fn, windows, **kwargs): return fn.dcmread().updated_dict(windows, **kwargs)

@delegates(parallel)
def _from_dicoms2(cls, fns, n_workers=0, **kwargs):
    return pd.DataFrame(parallel(_dcm2dict2, fns, n_workers=n_workers, **kwargs))

pd.DataFrame.from_dicoms2 = classmethod(_from_dicoms2)
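
As a quick check (not in the original notebook), updated_dict can also be called on a single file to inspect the dictionary it produces:

# read one dicom file and build its metadata dict with the lungs window
dcm = m_items[0].dcmread()
meta = dcm.updated_dict(windows=dicom_windows.lungs)
meta['img_pct_window'], meta['file_path']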

from_dicoms2 allows you to set the dicom window; in this example the mediastinum window is used.

dicom_dataframe = pd.DataFrame.from_dicoms2(m_items, windows=dicom_windows.mediastinum)
dicom_dataframe[:2]
SpecificCharacterSet ImageType SOPInstanceUID Modality Manufacturer ManufacturerModelName PatientName PatientID PatientSex DeidentificationMethod ... RescaleIntercept RescaleSlope RescaleType fname img_min img_max img_mean img_std img_pct_window file_path
0 ISO_IR 100 [ORIGINAL, PRIMARY, AXIAL] 2.25.12297650151329871895440507938349160734 CT GE MEDICAL SYSTEMS LightSpeed VCT (I, D, 0, 0, 0, 0, 7, 6, 3, 7, 2, 0, 2, 1, 7, 7, 4, 1, 1, 9, 5, 6, 4, 3, 0) ID00007637202177411956430 Table; ... -1024.0 1.0 HU D:\Datasets\osic-pulmonary-fibrosis-progression\train\ID00007637202177411956430\1.dcm -2000 2842 -1.454884 1137.488858 0.219498 ID00007637202177411956430/1.dcm
1 ISO_IR 100 [ORIGINAL, PRIMARY, AXIAL] 2.25.37611372879908126511187998276199853341 CT GE MEDICAL SYSTEMS LightSpeed VCT (I, D, 0, 0, 0, 0, 7, 6, 3, 7, 2, 0, 2, 1, 7, 7, 4, 1, 1, 9, 5, 6, 4, 3, 0) ID00007637202177411956430 Table; ... -1024.0 1.0 HU D:\Datasets\osic-pulmonary-fibrosis-progression\train\ID00007637202177411956430\10.dcm -2000 2918 19.038597 1138.876560 0.248634 ID00007637202177411956430/10.dcm

2 rows × 54 columns

In this case the lungs window is used; note how img_pct_window changes compared to the mediastinum window.

dicom_dataframe = pd.DataFrame.from_dicoms2(m_items, windows=dicom_windows.lungs)
dicom_dataframe[:2]
SpecificCharacterSet ImageType SOPInstanceUID Modality Manufacturer ManufacturerModelName PatientName PatientID PatientSex DeidentificationMethod ... RescaleIntercept RescaleSlope RescaleType fname img_min img_max img_mean img_std img_pct_window file_path
0 ISO_IR 100 [ORIGINAL, PRIMARY, AXIAL] 2.25.12297650151329871895440507938349160734 CT GE MEDICAL SYSTEMS LightSpeed VCT (I, D, 0, 0, 0, 0, 7, 6, 3, 7, 2, 0, 2, 1, 7, 7, 4, 1, 1, 9, 5, 6, 4, 3, 0) ID00007637202177411956430 Table; ... -1024.0 1.0 HU D:\Datasets\osic-pulmonary-fibrosis-progression\train\ID00007637202177411956430\1.dcm -2000 2842 -1.454884 1137.488858 0.702209 ID00007637202177411956430/1.dcm
1 ISO_IR 100 [ORIGINAL, PRIMARY, AXIAL] 2.25.37611372879908126511187998276199853341 CT GE MEDICAL SYSTEMS LightSpeed VCT (I, D, 0, 0, 0, 0, 7, 6, 3, 7, 2, 0, 2, 1, 7, 7, 4, 1, 1, 9, 5, 6, 4, 3, 0) ID00007637202177411956430 Table; ... -1024.0 1.0 HU D:\Datasets\osic-pulmonary-fibrosis-progression\train\ID00007637202177411956430\10.dcm -2000 2918 19.038597 1138.876560 0.720448 ID00007637202177411956430/10.dcm

2 rows × 54 columns

Move Files

Note: View dicom dataframe and move_files tutorial on Kaggle (click on Kaggle icon)
def move_files(df, source, save_path):
    "helper to move files"
    for i in df.index:
        # patient ID and window percentage for this row
        patid = str(df.PatientID[i])
        window = str(df.img_pct_window[i])

        # fname -> instance name (Path handles both / and \ separators)
        filename = Path(str(df.fname[i])).name
        img = filename.split('.')[0]
        print(f'ID: {patid} window: {window} instance: {img}')

        # copy the dicom file into a per-patient folder under save_path
        folder_path = save_path + patid
        if not os.path.exists(folder_path): os.makedirs(folder_path)
        img_file = Path(f'{source}/train/{patid}/{img}.dcm')
        shutil.copy(img_file, folder_path, follow_symlinks=True)

move_files[source]

move_files(df, source, save_path)

helper to move files
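
A sketch of how move_files could be used with the metadata dataframe built above (the destination folder below is hypothetical; source must be the dataset root so that {source}/train/{PatientID}/{InstanceNumber}.dcm resolves):

root = 'D:/Datasets/osic-pulmonary-fibrosis-progression'            # dataset root, no trailing /train
dest = 'D:/Datasets/osic-pulmonary-fibrosis-progression/selected/'  # hypothetical destination folder
# copy the files for the first 5 rows into per-patient folders under dest
move_files(dicom_dataframe[:5], source=root, save_path=dest)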

Dicom Convert 3 Channel

def dicom_convert_3channel(fn:(Path,str), save_dir:str, win1=dicom_windows.lungs,
                           win2=dicom_windows.liver, win3=dicom_windows.brain):
    "Split a dicom image into 3 windows with one window per channel and saved as jpg"
    data = dcmread(fn)
    file_name = str(fn); name = file_name.split('\\')[-1].split('.')[0]

    # window the image three times and stack the results into a 3-channel image
    chan_one = np.expand_dims(data.windowed(*win1), axis=2)
    chan_two = np.expand_dims(data.windowed(*win2), axis=2)
    chan_three = np.expand_dims(data.windowed(*win3), axis=2)
    image = np.concatenate([chan_one, chan_two, chan_three], axis=2)
    ten_image = TensorImage(image).permute(2,0,1)
    save_image(ten_image, f'{save_dir}/{name}.jpg')

dicom_convert_3channel[source]

dicom_convert_3channel(fn:(Path, str), save_dir:str, win1=(1500, -600), win2=(150, 30), win3=(80, 40))

Split a dicom image into 3 windows with one window per channel and saved as jpg

To see how dicom_convert_3channel works, specify a save directory and choose a test file

save_dir = 'D:/Datasets/osic-pulmonary-fibrosis-progression/test3c/'
test1 = m_items[12]
test1
Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430/20.dcm')

Choose 3 windows, one for each channel; in this case lungs, mediastinum and pe are used.

dicom_convert_3channel(test1, save_dir, win1=dicom_windows.lungs, win2=dicom_windows.mediastinum, win3=dicom_windows.pe)

Load the saved image, which is saved with the same name as the input image.

saved_image = PILImage.create('D:/Datasets/osic-pulmonary-fibrosis-progression/test3c/20.jpg')
saved_ten = TensorImage(saved_image)
saved_ten.shape
torch.Size([512, 512, 3])
show_images([saved_ten[:,:,:], saved_ten[:,:,0], saved_ten[:,:,1], saved_ten[:,:,2]],\
            titles=['all_channels', 'lungs', 'mediastinum', 'pe'])
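
To convert a whole scan rather than a single slice, the same function can simply be looped over the file list (a sketch, not part of the original notebook):

# convert every slice in m_items using the same three windows
for f in m_items:
    dicom_convert_3channel(f, save_dir, win1=dicom_windows.lungs,
                           win2=dicom_windows.mediastinum, win3=dicom_windows.pe)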

Dicom Splitter

Note: View dicom splitter tutorial on Kaggle (click on Kaggle icon)
def dicomsplit(valid_pct=0.2, seed=None, **kwargs):
    "Splits `items` between train/val with `valid_pct`"
    # and checks whether identical patient IDs exist in both the train and valid sets
    def _inner(o, **kwargs):
        if seed is not None: torch.manual_seed(seed)
        rand_idx = L(int(i) for i in torch.randperm(len(o)))
        cut = int(valid_pct * len(o))
        trn = rand_idx[cut:]; trn_p = o[rand_idx[cut:]]
        val = rand_idx[:cut]; val_p = o[rand_idx[:cut]]
        # collect patient IDs and pixel data for the train split
        train_patient = []; train_images = []
        for tfile in trn_p:
            file = dcmread(tfile)
            train_patient.append(file.PatientID)
            train_images.append(file.pixel_array)
        # collect patient IDs and pixel data for the valid split
        val_patient = []; val_images = []
        for vfile in val_p:
            file2 = dcmread(vfile)
            val_patient.append(file2.PatientID)
            val_images.append(file2.pixel_array)

        print(rand_idx)
        print(f'Train: {trn}, {train_patient}')
        show_images(train_images[:20])
        print(f'Val: {val}, {val_patient}')
        show_images(val_images[:20])
        # any patient ID that appears in both splits is a duplicate
        is_duplicate = set(train_patient) & set(val_patient)
        print(f'Duplicate: {set(train_patient) & set(val_patient)}')
        if is_duplicate:
            print('duplicate exists')
            new_list = [elem for elem in train_patient if elem not in val_patient]
            print(f'New List: {new_list}')
        else:
            print('duplicate does NOT exist')
            new_list = trn
        return new_list, val
    return _inner

dicomsplit[source]

dicomsplit(valid_pct=0.2, seed=None, **kwargs)

Splits items between train/val with valid_pct

def check_duplicate(items, seed=5):
    # run dicomsplit once and report whether any patient appears in both splits
    trn, val = dicomsplit(valid_pct=0.2, seed=seed)(items)
    return trn, val

def dicom_splitter(items, valid_pct=0.2, seed=77):
    # do the patient-aware split up front, then return a fastai-style splitter
    trn, val = dicomsplit(valid_pct=valid_pct, seed=seed)(items)
    valid_idx = val
    def _inner(o):
        train_idx = np.setdiff1d(np.array(range_of(o)), np.array(valid_idx))
        print(f'train:{train_idx} val:{valid_idx}')
        return L(train_idx, use_list=True), L(valid_idx, use_list=True)
    return _inner

dicom_splitter[source]

dicom_splitter(items, valid_pct=0.2, seed=77)

To check how dicom_splitter works, first create a helper function that picks 10 random indices between 0 and the number of items.

def random_(items, value=10):
    randomList = []
    for i in range(value):
        # randint is inclusive on both ends, so cap at len(items)-1 to avoid an IndexError
        randomList.append(random.randint(0, len(items)-1))
    return items[randomList]
items = get_dicom_files(source)
rand_items = random_(items)
rand_items
(#10) [Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00173637202238329754031/207.dcm'),Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00020637202178344345685/119.dcm'),Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00136637202224951350618/161.dcm'),Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00317637202283194142136/178.dcm'),Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00378637202298597306391/172.dcm'),Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00020637202178344345685/70.dcm'),Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00343637202287577133798/34.dcm'),Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00196637202246668775836/36.dcm'),Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00075637202198610425520/148.dcm'),Path('D:/Datasets/osic-pulmonary-fibrosis-progression/train/ID00111637202210956877205/171.dcm')]

check_duplicate shows the indices of the random 10 images chosen above. By default the train/valid split is 80/20, so the train set has 8 images and the valid set has 2 images. We can view the 8 images in the train set and the 2 images in the valid set.

set_seed(70)
check_duplicate(rand_items, seed=70)
[4, 9, 0, 7, 2, 1, 6, 3, 5, 8]
Train: [0, 7, 2, 1, 6, 3, 5, 8], ['ID00173637202238329754031', 'ID00196637202246668775836', 'ID00136637202224951350618', 'ID00020637202178344345685', 'ID00343637202287577133798', 'ID00317637202283194142136', 'ID00020637202178344345685', 'ID00075637202198610425520']
Val: [4, 9], ['ID00378637202298597306391', 'ID00111637202210956877205']
Duplicate: set()
duplicate does NOT exist
((#8) [0,7,2,1,6,3,5,8], (#2) [4,9])

In the example above there are no duplicates, so the train/valid sets remain unchanged. With a different seed a duplicate patient can appear in both sets:

set_seed(7)
check_duplicate(rand_items, seed=7)
[5, 0, 3, 4, 1, 7, 9, 6, 8, 2]
Train: [3, 4, 1, 7, 9, 6, 8, 2], ['ID00317637202283194142136', 'ID00378637202298597306391', 'ID00020637202178344345685', 'ID00196637202246668775836', 'ID00111637202210956877205', 'ID00343637202287577133798', 'ID00075637202198610425520', 'ID00136637202224951350618']
Val: [5, 0], ['ID00020637202178344345685', 'ID00173637202238329754031']
Duplicate: {'ID00020637202178344345685'}
duplicate exists
New List: ['ID00317637202283194142136', 'ID00378637202298597306391', 'ID00196637202246668775836', 'ID00111637202210956877205', 'ID00343637202287577133798', 'ID00075637202198610425520', 'ID00136637202224951350618']
(['ID00317637202283194142136',
  'ID00378637202298597306391',
  'ID00196637202246668775836',
  'ID00111637202210956877205',
  'ID00343637202287577133798',
  'ID00075637202198610425520',
  'ID00136637202224951350618'],
 (#2) [5,0])
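
dicom_splitter follows the fastai splitter convention: it returns a callable that maps the items to two index lists. A minimal sketch of calling it directly on the random items from above (not part of the original notebook):

splitter = dicom_splitter(rand_items, valid_pct=0.2, seed=77)
train_idx, valid_idx = splitter(rand_items)
train_idx, valid_idx

The same callable can be passed as the splitter argument of a DataBlock, provided get_items yields the same list of dicom files.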