Predict-Lung-Disease #9

Open

wants to merge 3 commits into base: master
164 changes: 164 additions & 0 deletions Predict-Lung-Disease-master/000_preprocess.py
@@ -0,0 +1,164 @@
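# 000_preprocess.py
# Builds patient-level train/valid/test splits for the NIH ChestX-ray14 data and saves,
# for every image, a 14-element binary disease label vector plus the image-name partitions
# as pickle files for later training and evaluation.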
import sys, os
import pickle
import random
import re
import tqdm
import cv2
import numpy as np
import pandas as pd
import sklearn.model_selection
from collections import Counter

# Put the project's Code/src folder on sys.path before importing the project utilities.
paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]
def add_path_to_sys_path(path_to_append):
    if not any(path_to_append in paths for paths in sys.path):
        sys.path.append(path_to_append)

for crt_path in paths_to_append:
    add_path_to_sys_path(crt_path)

import azure_chestxray_utils

path = os.path.join(os.getcwd(), 'azure-share')
if not os.path.exists(path):
    os.mkdir(path)
amlWBSharedDir = path




prj_consts = azure_chestxray_utils.chestxray_consts()
print(prj_consts)

data_base_input_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))
data_base_output_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))

isExists1 = os.path.exists(data_base_input_dir)
isExists2 = os.path.exists(data_base_output_dir)

# create the input/output directories if needed, keeping the path strings intact
if not isExists1:
    os.makedirs(data_base_input_dir)
print(data_base_input_dir)

if not isExists2:
    os.makedirs(data_base_output_dir)
print(data_base_output_dir)

nih_chest_xray_data_dir = os.path.join(data_base_input_dir,
                                       os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))
isExists3 = os.path.exists(nih_chest_xray_data_dir)
if not isExists3:
    os.makedirs(nih_chest_xray_data_dir)

print(nih_chest_xray_data_dir)

other_data_dir=os.path.join(data_base_input_dir, os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))
data_partitions_dir=os.path.join(data_base_output_dir, os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list)))

ignored_images_set = set()

total_patient_number = 30805
NIH_annotated_file = 'BBox_List_2017.csv'  # pathology annotated by radiologists; excluded from training
manually_selected_bad_images_file = 'blacklist.csv'  # exclude what visually looks like bad images

patient_id_original = [i for i in range(1,total_patient_number + 1)]

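# Patients that appear in the radiologist bounding-box file are removed from the shuffled
# train/valid pool and added back to the test set below. The patient id is parsed from
# positions 3:8 of the image file name (e.g. '00000013_005.png' -> 13).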
bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))
bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)

bbox_patient_index_list = []
for index, item in bbox_patient_index_df.items():  # Series.items(); iteritems() is removed in pandas 2.x
    bbox_patient_index_list.append(int(item))

patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))
print("len of original patient id is", len(patient_id_original))
print("len of cleaned patient id is", len(patient_id))
print("len of unique patient id with annotated data",
len(list(set(bbox_patient_index_list))))
print("len of patient id with annotated data",bbox_df.shape[0])

random.seed(0)
random.shuffle(patient_id)

print("first ten patient ids are", patient_id[:10])

# training:valid:test=7:1:2
patient_id_train = patient_id[:int(total_patient_number * 0.7)]
patient_id_valid = patient_id[int(total_patient_number * 0.7):int(total_patient_number * 0.8)]
# get the rest of the patient_id as the test set
patient_id_test = patient_id[int(total_patient_number * 0.8):]
patient_id_test.extend(bbox_patient_index_list)
patient_id_test = list(set(patient_id_test))

print("train:{} valid:{} test:{}".format(len(patient_id_train), len(patient_id_valid), len(patient_id_test)))

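# Data_Entry_2017.csv lists every image with its patient id and a '|'-separated string of finding labels.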
pathologies_name_list = prj_consts.DISEASE_list
NIH_patients_and_labels_file = 'Data_Entry_2017.csv'

labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))


#show the label distribution

# Unique IDs frequencies can be computed using list comprehension or collections lib
# [[x,(list(crtData['fullID2'])).count(x)] for x in set(crtData['fullID2'])]
# for tallying, collections lib is faster than list comprehension
pathology_distribution = Counter(list(labels_df['Finding Labels']))

# Sort it by ID frequency (dict value)
sorted_by_freq = sorted(pathology_distribution.items(), key=lambda x: x[1], reverse=True)
print(len(sorted_by_freq))
print(sorted_by_freq[:20])
print(sorted_by_freq[-10:])

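# Per-disease image counts: split the multi-label strings on '|' and sum the resulting one-hot columns.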
print(labels_df['Finding Labels'].str.split( '|', expand=False).str.join(sep='*').str.get_dummies(sep='*').sum())

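# For a list of patient ids, collect the image names and build a dict that maps each image
# name to a 14-element 0/1 vector, one slot per disease in pathologies_name_list.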
def process_data(current_df, patient_ids):
    image_name_index = []
    image_labels = {}
    for individual_patient in tqdm.tqdm(patient_ids):
        for _, row in current_df[current_df['Patient ID'] == individual_patient].iterrows():
            processed_image_name = row['Image Index']
            if processed_image_name in ignored_images_set:
                pass
            else:
                image_name_index.append(processed_image_name)
                image_labels[processed_image_name] = np.zeros(14, dtype=np.uint8)
                for disease_index, ele in enumerate(pathologies_name_list):
                    if re.search(ele, row['Finding Labels'], re.IGNORECASE):
                        image_labels[processed_image_name][disease_index] = 1
                    else:
                        # redundant code but just to make it more readable
                        image_labels[processed_image_name][disease_index] = 0
                # print("processed", row['Image Index'])
    return image_name_index, image_labels


train_data_index, train_labels = process_data(labels_df, patient_id_train)
valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)
test_data_index, test_labels = process_data(labels_df, patient_id_test)

print("train, valid, test image number is:", len(train_data_index), len(valid_data_index), len(test_data_index))

# save the data
labels_all = {}
labels_all.update(train_labels)
labels_all.update(valid_labels)
labels_all.update(test_labels)

partition_dict = {'train': train_data_index, 'test': test_data_index, 'valid': valid_data_index}

with open(os.path.join(data_partitions_dir, 'labels14_unormalized_cleaned.pickle'), 'wb') as f:
    pickle.dump(labels_all, f)

with open(os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle'), 'wb') as f:
    pickle.dump(partition_dict, f)

# also save the patient id partitions for pytorch training
with open(os.path.join(data_partitions_dir, 'train_test_valid_data_partitions.pickle'), 'wb') as f:
    pickle.dump([patient_id_train, patient_id_valid,
                 patient_id_test,
                 list(set(bbox_patient_index_list))], f)

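# A later training or evaluation script can restore these artifacts with pickle.load, e.g.
# (sketch, assuming the same data_partitions_dir):
#     with open(os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle'), 'rb') as f:
#         partition = pickle.load(f)
#     with open(os.path.join(data_partitions_dir, 'labels14_unormalized_cleaned.pickle'), 'rb') as f:
#         labels = pickle.load(f)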
print(type(train_labels))
print({k: train_labels[k] for k in list(train_labels)[:5]})
157 changes: 157 additions & 0 deletions Predict-Lung-Disease-master/020_evaluate.py
@@ -0,0 +1,157 @@
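# 020_evaluate.py
# Rebuilds the DenseNet-121 chest X-ray model from saved weights, scores the test partition
# with a Keras Sequence generator, and compares per-disease ROC AUC against the published
# Stanford scores, writing the comparison table to a CSV file next to the weights.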
import sys, os
import pickle
import cv2
import numpy as np
import pandas as pd
from keras.models import load_model
from keras.utils import Sequence
from sklearn import metrics
from tensorflow.python.client import device_lib
import keras_contrib

import azure_chestxray_utils
import azure_chestxray_keras_utils

path = os.path.join(os.getcwd(), 'azure-share')
amlWBSharedDir = path

prj_consts = azure_chestxray_utils.chestxray_consts()
data_base_input_dir=os.path.join(amlWBSharedDir,
os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))
data_base_output_dir=os.path.join(amlWBSharedDir,
os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))
weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.MODEL_WEIGHTS_DIR_list)))
fully_trained_weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.FULLY_PRETRAINED_MODEL_DIR_list)))

nih_chest_xray_data_dir = os.path.join(data_base_input_dir,
os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))

data_partitions_dir = os.path.join(data_base_output_dir,
os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list)))

label_path = os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle')

partition_path = os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle')

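# Load the fully trained model once and re-save its weights, so that the evaluation loop
# below can rebuild the architecture and apply the weights with load_weights().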
model_file_name = 'azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5'
model = load_model(os.path.join(fully_trained_weights_dir, model_file_name))
model.save_weights(os.path.join(fully_trained_weights_dir, 'weights_only_'+model_file_name))
models_file_name= [os.path.join(fully_trained_weights_dir,
'weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5')]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"



resized_height = 224
resized_width = 224
num_channel = 3
num_classes = 14
batch_size = 100 #512
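# Input geometry and class count expected by the model: 224x224 RGB images, 14 findings,
# evaluated in batches of 100 images.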

def get_available_gpus():
    """Return the names of the GPUs visible to TensorFlow."""
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

# get the available GPUs once and report how many there are
num_gpu = get_available_gpus()
print("num of GPUs:", len(num_gpu))

pathologies_name_list = prj_consts.DISEASE_list
print(pathologies_name_list)

stanford_result = [0.8094, 0.9248, 0.8638, 0.7345, 0.8676, 0.7802, 0.7680, 0.8887, 0.7901, 0.8878, 0.9371, 0.8047,
0.8062, 0.9164]


with open(label_path, 'rb') as f:
    labels = pickle.load(f)

with open(partition_path, 'rb') as f:
    partition = pickle.load(f)

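# Keras Sequence that yields (images, label vectors) batches for prediction; each image is
# read from nih_chest_xray_data_dir and resized to 224x224 before being stacked into the batch.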
class DataGenSequence(Sequence):
    def __init__(self, labels, image_file_index, current_state):
        self.batch_size = batch_size
        self.labels = labels
        self.img_file_index = image_file_index
        self.current_state = current_state
        self.len = len(self.img_file_index) // self.batch_size
        print("for DataGenSequence", current_state, "total rows are:", len(self.img_file_index), ", len is", self.len)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        # print("loading data segmentation", idx)
        # make sure every batch has exactly batch_size rows
        current_batch = self.img_file_index[idx * self.batch_size: (idx + 1) * self.batch_size]
        X = np.empty((self.batch_size, resized_height, resized_width, num_channel))
        y = np.empty((self.batch_size, num_classes))

        for i, image_name in enumerate(current_batch):
            path = os.path.join(nih_chest_xray_data_dir, image_name)

            # load and resize the image, then attach its label vector
            img = cv2.resize(cv2.imread(path), (resized_height, resized_width)).astype(np.float16)
            X[i, :, :, :] = img
            y[i, :] = labels[image_name]

        # only do random flipping in training status
        if self.current_state == 'train':
            # this is different from the training code
            x_augmented = X
        else:
            x_augmented = X

        return x_augmented, y



# load test data
X_test = np.empty((len(partition['test']), 224, 224, 3), dtype=np.float16)
y_test = np.empty((len(partition['test']) - len(partition['test']) % batch_size, 14), dtype=np.float16)

for i, npy in enumerate(partition['test']):
    if i < len(y_test):
        # round to batch_size
        y_test[i, :] = labels[npy]

print("len of result is", len(y_test))
y_pred_list = np.empty((len(models_file_name), len(partition['test']), 14), dtype=np.float16)

# individual models
for index, current_model_file in enumerate(models_file_name):
    print(current_model_file)
    # model = load_model(current_model_file)
    model = azure_chestxray_keras_utils.build_model(keras_contrib.applications.densenet.DenseNetImageNet121)
    model.load_weights(current_model_file)
    print('evaluation for model', current_model_file)
    # y_pred = model.predict(X_test)

    y_pred = model.predict_generator(generator=DataGenSequence(labels, partition['test'], current_state='test'),
                                     workers=32, verbose=1, max_queue_size=1)
    print("result shape", y_pred.shape)

    # add one fake row of ones in both test and pred values to avoid:
    # ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.
    y_test = np.insert(y_test, 0, np.ones((y_test.shape[1],)), 0)
    y_pred = np.insert(y_pred, 0, np.ones((y_pred.shape[1],)), 0)

    df = pd.DataFrame(columns=['Disease', 'Our AUC Score', 'Stanford AUC Score'])
    for d in range(14):
        df.loc[d] = [pathologies_name_list[d],
                     metrics.roc_auc_score(y_test[:, d], y_pred[:, d]),
                     stanford_result[d]]

    df['Delta'] = df['Stanford AUC Score'] - df['Our AUC Score']
    df.to_csv(current_model_file + ".csv", index=False)
    print(df)