# ========== DATA NOTES ========== #
#
# Number of "Sample Images" patients = 20
# Number of "Training" patients = 1397
# Each patient is associated with between, approximately, 100 and 400 DICOM images
# The cancer rate within the training set is 25.91%
#

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import glob
import datetime
import dicom
import scipy.ndimage
from skimage import measure, morphology
from sklearn.metrics import log_loss
from mpl_toolkits.mplot3d.art3d import Poly3DCollection

os.chdir('C://Users//Jay//Desktop//DSB2017')


# ========== CREATE FUNCTIONS USED IN CODE ========== #

def _masked_pixels(dcm):
    """Return the pixel array of an already-loaded DICOM dataset.

    Every pixel equal to -2000 is overwritten with 0 in the array returned by
    ``dcm.pixel_array`` (the replacement is done in place on that array).
    NOTE(review): -2000 is presumably a padding / out-of-scan marker in this
    data set - confirm against the data description.
    """
    img = dcm.pixel_array
    img[img == -2000] = 0
    return img


def dicom_to_image(filename):
    """Read the DICOM file at *filename* and return its pixel array.

    Pixels equal to -2000 are replaced by 0 (see ``_masked_pixels``).
    """
    return _masked_pixels(dicom.read_file(filename))


def get_slice_location(dcm):
    """Return the value of DICOM tag (0020,1041) of *dcm* as a float.

    Raises whatever the dataset lookup raises (typically ``KeyError``) when
    the tag is absent from the dataset.
    """
    return float(dcm[0x0020, 0x1041].value)


def load_patient(patient_id, data_dir='Original_Data//sample_images'):
    """Return the images of one patient in ascending order of Slice Location.

    Parameters
    ----------
    patient_id : str
        Name of the patient's sub-directory inside *data_dir*.
    data_dir : str, optional
        Directory holding one sub-directory of ``.dcm`` files per patient.
        Defaults to the sample-image directory used throughout this script.

    Returns
    -------
    list
        One pixel array per ``.dcm`` file found, with -2000 replaced by 0,
        sorted by the slice location reported by ``get_slice_location``.
        An empty list is returned when no file matches.

    Notes
    -----
    Each file is read once. Slices are collected as (location, image) pairs
    rather than in a dict keyed by location, so two files that report the
    same Slice Location are both kept instead of one silently replacing the
    other. The sort is stable, so tied slices stay in glob order.
    """
    pattern = '{}//{}//*.dcm'.format(data_dir, patient_id)
    located = []
    for path in glob.glob(pattern):
        dcm = dicom.read_file(path)
        located.append((get_slice_location(dcm), _masked_pixels(dcm)))
    # Sort on the location only; comparing the image arrays themselves on a
    # tie would raise.
    located.sort(key=lambda pair: pair[0])
    return [img for _, img in located]


# This function takes in a single frame from the DICOM and returns a single frame in RGB format.
def normalise(img, scale=14):
    """Convert a single-channel frame into an 8-bit, 3-channel (RGB) frame.

    Parameters
    ----------
    img : numpy.ndarray
        Single frame of pixel intensities.
    scale : int or float, optional
        Divisor applied to the intensities before the 8-bit conversion. The
        default of 14 is the script's original magic number, chosen to bring
        the intensities into the 0-255 range.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``img.shape + (3,)`` whose three channels
        are identical copies of the scaled frame.

    Notes
    -----
    The scaled values are clipped to [0, 255] before the cast, so intensities
    above ``255 * scale`` saturate at 255 and negative ones at 0 instead of
    wrapping around in the ``uint8`` conversion.
    """
    scaled = np.clip(np.asarray(img) / scale, 0, 255).astype(np.uint8)
    # Replicate the single grey channel into the R, G and B channels.
    return np.stack([scaled] * 3, axis=-1)


# ========== INVESTIGATE SAMPLE IMAGES ========== #

for d in os.listdir("Original_Data//sample_images"):
    print("Patient '{}' has {} scans".format(d, len(os.listdir("Original_Data//sample_images//" + d))))
print('----')
print('Total number of patients: {}\nTotal number of DCM files: {}'.format(
    len(os.listdir("Original_Data//sample_images")),
    len(glob.glob("Original_Data//sample_images//*//*.dcm"))))

patient_sizes = [len(os.listdir('Original_Data//sample_images//' + d))
                 for d in os.listdir('Original_Data//sample_images')]
plt.hist(patient_sizes)
plt.ylabel('Number of patients')
plt.xlabel('DICOM files')
plt.title('Histogram of DICOM count per patient')
# Show the histogram now; otherwise the pyplot calls further down would keep
# drawing on this same figure and overlay a CT slice on the histogram.
plt.show()

sizes = [os.path.getsize(dcm)/1000000 for dcm in glob.glob("Original_Data//sample_images//*//*.dcm")]
print('DCM file sizes: min {:.3}MB, max {:.3}MB, avg {:.3}MB, std {:.3}MB'.format(
    np.min(sizes), np.max(sizes), np.mean(sizes), np.std(sizes)))


# ========== INVESTIGATE TRAINING SET ========== #

df_train = pd.read_csv('Original_Data//stage1_labels//stage1_labels.csv')
df_train.head()  # only displays something in an interactive session
# Mean of the 'cancer' label column, reused below as the constant prediction.
cancer_rate = df_train.cancer.mean()
print('Number of training patients: {}'.format(len(df_train)))
print('Cancer rate: {:.4}%'.format(cancer_rate*100))


# ========== CREATE NAIVE SUBMISSION ========== #

# Baseline: predict the training-set cancer rate for every patient.
logloss = log_loss(df_train.cancer, np.full(len(df_train), cancer_rate))
print('Training logloss is {}'.format(logloss))

sample = pd.read_csv('Original_Data//stage1_sample_submission//stage1_sample_submission.csv')
sample['cancer'] = cancer_rate
sample.to_csv('Submissions//naive_submission_'+datetime.datetime.now().strftime("%Y%m%d%H%M")+'.csv', index=False)


# ========== INVESTIGATE IMAGE PROPERTIES ========== #

patientID = '0a38e7597ca26f9374f8ea2770ba870d'
imageID = '4ec5ef19b52ec06a819181e404d37038'
dcm_path = 'Original_Data//sample_images//'+patientID+'//'+imageID+'.dcm'
print('Filename: {}'.format(dcm_path))
dcmFile = dicom.read_file(dcm_path)
dcmFile  # only displays something in an interactive session

img = dcmFile.pixel_array
img[img == -2000] = 0

plt.axis('off')
plt.imshow(img)
plt.show()

plt.axis('off')
plt.imshow(-img)  # Invert colors
plt.show()

# 4 x 5 grid of randomly chosen slices from all sample patients.
files = glob.glob('Original_Data/sample_images/*/*.dcm')
f, plots = plt.subplots(4, 5, sharex='col', sharey='row', figsize=(10, 8))
for i in range(20):
    ax = plots[i // 5, i % 5]
    ax.axis('off')
    ax.imshow(dicom_to_image(np.random.choice(files)), cmap=plt.cm.bone)
plt.show()

# 11 x 10 grid of the first 110 slices of one patient, sorted by location.
pat = load_patient(patientID)
f, plots = plt.subplots(11, 10, sharex='all', sharey='all', figsize=(10, 11))
# Figure-level title: plt.title would only label the last subplot.
f.suptitle('Sorted Slices of Patient '+patientID)
for i in range(110):
    ax = plots[i // 10, i % 10]
    ax.axis('off')
    # Patients with fewer than 110 slices leave the remaining cells blank.
    if i < len(pat):
        ax.imshow(pat[i], cmap=plt.cm.bone)
plt.show()

npat = [normalise(p) for p in pat]