Predict-Lung-Disease #9

Open

wants to merge 3 commits into base: master
164 changes: 164 additions & 0 deletions Predict-Lung-Disease-master/000_preprocess.py
@@ -0,0 +1,164 @@
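# 000_preprocess.py
# Builds patient-level train/valid/test splits for the NIH ChestX-ray14 data and saves,
# for every image, a 14-element binary disease label vector plus the image-name partitions
# as pickle files for later training and evaluation.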
import sys, os
import pickle
import random
import re
import tqdm
import cv2
import numpy as np
import pandas as pd
import sklearn.model_selection
from collections import Counter

# Put the project's Code/src folder on sys.path before importing the project utilities.
paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]
def add_path_to_sys_path(path_to_append):
    if not any(path_to_append in paths for paths in sys.path):
        sys.path.append(path_to_append)

for crt_path in paths_to_append:
    add_path_to_sys_path(crt_path)

import azure_chestxray_utils

path = os.path.join(os.getcwd(), 'azure-share')
if not os.path.exists(path):
    os.mkdir(path)
amlWBSharedDir = path




prj_consts = azure_chestxray_utils.chestxray_consts()
print(prj_consts)

data_base_input_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))
data_base_output_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))

isExists1 = os.path.exists(data_base_input_dir)
isExists2 = os.path.exists(data_base_output_dir)

# create the input/output directories if needed, keeping the path strings intact
if not isExists1:
    os.makedirs(data_base_input_dir)
print(data_base_input_dir)

if not isExists2:
    os.makedirs(data_base_output_dir)
print(data_base_output_dir)

nih_chest_xray_data_dir = os.path.join(data_base_input_dir,
                                       os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))
isExists3 = os.path.exists(nih_chest_xray_data_dir)
if not isExists3:
    os.makedirs(nih_chest_xray_data_dir)

print(nih_chest_xray_data_dir)

other_data_dir=os.path.join(data_base_input_dir, os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))
data_partitions_dir=os.path.join(data_base_output_dir, os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list)))

ignored_images_set = set()

total_patient_number = 30805
NIH_annotated_file = 'BBox_List_2017.csv'  # pathology annotated by radiologists; excluded from training
manually_selected_bad_images_file = 'blacklist.csv'  # exclude what visually looks like bad images

patient_id_original = [i for i in range(1,total_patient_number + 1)]

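# Patients that appear in the radiologist bounding-box file are removed from the shuffled
# train/valid pool and added back to the test set below. The patient id is parsed from
# positions 3:8 of the image file name (e.g. '00000013_005.png' -> 13).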
bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))
bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)

bbox_patient_index_list = []
for index, item in bbox_patient_index_df.items():  # Series.items(); iteritems() is removed in pandas 2.x
    bbox_patient_index_list.append(int(item))

patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))
print("len of original patient id is", len(patient_id_original))
print("len of cleaned patient id is", len(patient_id))
print("len of unique patient id with annotated data",
len(list(set(bbox_patient_index_list))))
print("len of patient id with annotated data",bbox_df.shape[0])

random.seed(0)
random.shuffle(patient_id)

print("first ten patient ids are", patient_id[:10])

# training:valid:test=7:1:2
patient_id_train = patient_id[:int(total_patient_number * 0.7)]
patient_id_valid = patient_id[int(total_patient_number * 0.7):int(total_patient_number * 0.8)]
# get the rest of the patient_id as the test set
patient_id_test = patient_id[int(total_patient_number * 0.8):]
patient_id_test.extend(bbox_patient_index_list)
patient_id_test = list(set(patient_id_test))

print("train:{} valid:{} test:{}".format(len(patient_id_train), len(patient_id_valid), len(patient_id_test)))

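# Data_Entry_2017.csv lists every image with its patient id and a '|'-separated string of finding labels.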
pathologies_name_list = prj_consts.DISEASE_list
NIH_patients_and_labels_file = 'Data_Entry_2017.csv'

labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))


#show the label distribution

# Unique IDs frequencies can be computed using list comprehension or collections lib
# [[x,(list(crtData['fullID2'])).count(x)] for x in set(crtData['fullID2'])]
# for tallying, collections lib is faster than list comprehension
pathology_distribution = Counter(list(labels_df['Finding Labels']))

# Sort it by ID frequency (dict value)
sorted_by_freq = sorted(pathology_distribution.items(), key=lambda x: x[1], reverse=True)
print(len(sorted_by_freq))
print(sorted_by_freq[:20])
print(sorted_by_freq[-10:])

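# Per-disease image counts: split the multi-label strings on '|' and sum the resulting one-hot columns.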
print(labels_df['Finding Labels'].str.split( '|', expand=False).str.join(sep='*').str.get_dummies(sep='*').sum())

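# For a list of patient ids, collect the image names and build a dict that maps each image
# name to a 14-element 0/1 vector, one slot per disease in pathologies_name_list.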
def process_data(current_df, patient_ids):
    image_name_index = []
    image_labels = {}
    for individual_patient in tqdm.tqdm(patient_ids):
        for _, row in current_df[current_df['Patient ID'] == individual_patient].iterrows():
            processed_image_name = row['Image Index']
            if processed_image_name in ignored_images_set:
                pass
            else:
                image_name_index.append(processed_image_name)
                image_labels[processed_image_name] = np.zeros(14, dtype=np.uint8)
                for disease_index, ele in enumerate(pathologies_name_list):
                    if re.search(ele, row['Finding Labels'], re.IGNORECASE):
                        image_labels[processed_image_name][disease_index] = 1
                    else:
                        # redundant code but just to make it more readable
                        image_labels[processed_image_name][disease_index] = 0
                # print("processed", row['Image Index'])
    return image_name_index, image_labels


train_data_index, train_labels = process_data(labels_df, patient_id_train)
valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)
test_data_index, test_labels = process_data(labels_df, patient_id_test)

print("train, valid, test image number is:", len(train_data_index), len(valid_data_index), len(test_data_index))

# save the data
labels_all = {}
labels_all.update(train_labels)
labels_all.update(valid_labels)
labels_all.update(test_labels)

partition_dict = {'train': train_data_index, 'test': test_data_index, 'valid': valid_data_index}

with open(os.path.join(data_partitions_dir, 'labels14_unormalized_cleaned.pickle'), 'wb') as f:
    pickle.dump(labels_all, f)

with open(os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle'), 'wb') as f:
    pickle.dump(partition_dict, f)

# also save the patient id partitions for pytorch training
with open(os.path.join(data_partitions_dir, 'train_test_valid_data_partitions.pickle'), 'wb') as f:
    pickle.dump([patient_id_train, patient_id_valid,
                 patient_id_test,
                 list(set(bbox_patient_index_list))], f)

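# A later training or evaluation script can restore these artifacts with pickle.load, e.g.
# (sketch, assuming the same data_partitions_dir):
#     with open(os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle'), 'rb') as f:
#         partition = pickle.load(f)
#     with open(os.path.join(data_partitions_dir, 'labels14_unormalized_cleaned.pickle'), 'rb') as f:
#         labels = pickle.load(f)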
print(type(train_labels))
print({k: train_labels[k] for k in list(train_labels)[:5]})
157 changes: 157 additions & 0 deletions Predict-Lung-Disease-master/020_evaluate.py
@@ -0,0 +1,157 @@
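# 020_evaluate.py
# Rebuilds the DenseNet-121 chest X-ray model from saved weights, scores the test partition
# with a Keras Sequence generator, and compares per-disease ROC AUC against the published
# Stanford scores, writing the comparison table to a CSV file next to the weights.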
import sys, os
import pickle
import cv2
import numpy as np
import pandas as pd
from keras.models import load_model
from keras.utils import Sequence
from sklearn import metrics
from tensorflow.python.client import device_lib
import keras_contrib

import azure_chestxray_utils
import azure_chestxray_keras_utils

path = os.path.join(os.getcwd(), 'azure-share')
amlWBSharedDir = path

prj_consts = azure_chestxray_utils.chestxray_consts()
data_base_input_dir=os.path.join(amlWBSharedDir,
os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))
data_base_output_dir=os.path.join(amlWBSharedDir,
os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))
weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.MODEL_WEIGHTS_DIR_list)))
fully_trained_weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.FULLY_PRETRAINED_MODEL_DIR_list)))

nih_chest_xray_data_dir = os.path.join(data_base_input_dir,
os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))

data_partitions_dir = os.path.join(data_base_output_dir,
os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list)))

label_path = os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle')

partition_path = os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle')

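# Load the fully trained model once and re-save its weights, so that the evaluation loop
# below can rebuild the architecture and apply the weights with load_weights().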
model_file_name = 'azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5'
model = load_model(os.path.join(fully_trained_weights_dir, model_file_name))
model.save_weights(os.path.join(fully_trained_weights_dir, 'weights_only_'+model_file_name))
models_file_name= [os.path.join(fully_trained_weights_dir,
'weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5')]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"



resized_height = 224
resized_width = 224
num_channel = 3
num_classes = 14
batch_size = 100 #512
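# Input geometry and class count expected by the model: 224x224 RGB images, 14 findings,
# evaluated in batches of 100 images.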

def get_available_gpus():
    """Return the names of the GPUs visible to TensorFlow."""
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

# get the available GPUs once and report how many there are
num_gpu = get_available_gpus()
print("num of GPUs:", len(num_gpu))

pathologies_name_list = prj_consts.DISEASE_list
print(pathologies_name_list)

stanford_result = [0.8094, 0.9248, 0.8638, 0.7345, 0.8676, 0.7802, 0.7680, 0.8887, 0.7901, 0.8878, 0.9371, 0.8047,
0.8062, 0.9164]


with open(label_path, 'rb') as f:
    labels = pickle.load(f)

with open(partition_path, 'rb') as f:
    partition = pickle.load(f)

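# Keras Sequence that yields (images, label vectors) batches for prediction; each image is
# read from nih_chest_xray_data_dir and resized to 224x224 before being stacked into the batch.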
class DataGenSequence(Sequence):
    def __init__(self, labels, image_file_index, current_state):
        self.batch_size = batch_size
        self.labels = labels
        self.img_file_index = image_file_index
        self.current_state = current_state
        self.len = len(self.img_file_index) // self.batch_size
        print("for DataGenSequence", current_state, "total rows are:", len(self.img_file_index), ", len is", self.len)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        # print("loading data segmentation", idx)
        # make sure every batch has exactly batch_size rows
        current_batch = self.img_file_index[idx * self.batch_size: (idx + 1) * self.batch_size]
        X = np.empty((self.batch_size, resized_height, resized_width, num_channel))
        y = np.empty((self.batch_size, num_classes))

        for i, image_name in enumerate(current_batch):
            path = os.path.join(nih_chest_xray_data_dir, image_name)

            # load and resize the image, then attach its label vector
            img = cv2.resize(cv2.imread(path), (resized_height, resized_width)).astype(np.float16)
            X[i, :, :, :] = img
            y[i, :] = labels[image_name]

        # only do random flipping in training status
        if self.current_state == 'train':
            # this is different from the training code
            x_augmented = X
        else:
            x_augmented = X

        return x_augmented, y



# load test data
X_test = np.empty((len(partition['test']), 224, 224, 3), dtype=np.float16)
y_test = np.empty((len(partition['test']) - len(partition['test']) % batch_size, 14), dtype=np.float16)

for i, npy in enumerate(partition['test']):
    if i < len(y_test):
        # round to batch_size
        y_test[i, :] = labels[npy]

print("len of result is", len(y_test))
y_pred_list = np.empty((len(models_file_name), len(partition['test']), 14), dtype=np.float16)

# individual models
for index, current_model_file in enumerate(models_file_name):
    print(current_model_file)
    # model = load_model(current_model_file)
    model = azure_chestxray_keras_utils.build_model(keras_contrib.applications.densenet.DenseNetImageNet121)
    model.load_weights(current_model_file)
    print('evaluation for model', current_model_file)
    # y_pred = model.predict(X_test)

    y_pred = model.predict_generator(generator=DataGenSequence(labels, partition['test'], current_state='test'),
                                     workers=32, verbose=1, max_queue_size=1)
    print("result shape", y_pred.shape)

    # add one fake row of ones in both test and pred values to avoid:
    # ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.
    y_test = np.insert(y_test, 0, np.ones((y_test.shape[1],)), 0)
    y_pred = np.insert(y_pred, 0, np.ones((y_pred.shape[1],)), 0)

    df = pd.DataFrame(columns=['Disease', 'Our AUC Score', 'Stanford AUC Score'])
    for d in range(14):
        df.loc[d] = [pathologies_name_list[d],
                     metrics.roc_auc_score(y_test[:, d], y_pred[:, d]),
                     stanford_result[d]]

    df['Delta'] = df['Stanford AUC Score'] - df['Our AUC Score']
    df.to_csv(current_model_file + ".csv", index=False)
    print(df)