AIMedLab
diff --git a/‎Eval.py
+45 b/‎Eval.py
+45
diff --git a/‎FAERSdata.py
+57 b/‎FAERSdata.py
+57
diff --git a/‎Model.py
+75 b/‎Model.py
+75
diff --git a/‎README.md
+35 b/‎README.md
+35
diff --git a/‎img/pipeline.jpg
585 KB b/‎img/pipeline.jpg
585 KB
diff --git a/‎mapping.py
+14 b/‎mapping.py
+14
diff --git a/‎pickles/drugid2rxnorm_mapping.pkl
95 KB b/‎pickles/drugid2rxnorm_mapping.pkl
95 KB
diff --git a/‎pickles/rxnorm2features_mapping.pkl
4.61 MB b/‎pickles/rxnorm2features_mapping.pkl
4.61 MB
diff --git a/‎pickles/sider_eval_pairs_final.pkl
960 KB b/‎pickles/sider_eval_pairs_final.pkl
960 KB
diff --git a/‎run.py
+115 b/‎run.py
+115
@@ -0,0 +1,45 @@
+from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score, accuracy_score, precision_recall_curve
+import numpy as np
+
+class Eval:
+    def __init__(self, pred, gold):
+        self.pred = pred
+        self.gold = gold
+
+    def Metrics(self, metics):
+        if metics == 'all':
+            auc = roc_auc_score(self.gold, self.pred)
+            aupr = average_precision_score(self.gold, self.pred)
+            recalls, precisions, thresholds_pr = precision_recall_curve(self.gold, self.pred)
+            f1s = (2 * np.multiply(precisions, recalls)) / np.add(precisions, recalls)
+            f1s = np.nan_to_num(f1s)
+            max_idx = int(np.argmax(f1s))
+            precision = precisions[max_idx]
+            recall = recalls[max_idx]
+            f1 = f1s[max_idx]
+            threshold = thresholds_pr[max_idx]
+            y_scores_label = np.copy(self.pred)
+            y_scores_label = np.where(y_scores_label > threshold, 1, 0)
+            y_scores_label = y_scores_label.astype(int)
+            accuracy = accuracy_score(self.gold, y_scores_label)
+            return np.array([auc, aupr, precision, recall, accuracy, f1])
+        elif metics == 'specificity-sensitivity':
+            # recall, sensitivity: true positive rate tp/(tp+fn)
+            # specificity: true negative rate tn/(tn+fp)
+            auc = roc_auc_score(self.gold, self.pred)
+
+            fixed_sensitivity, fixed_specificity = [], []
+            fpr, tpr, _ = roc_curve(self.gold, self.pred)
+            sensitivity, specificity = tpr, 1 - fpr
+
+            for i in range(1, 10):
+                value = i * 0.1
+                sensitivity_idx = np.argmin(np.abs(sensitivity-value))
+                spec = specificity[sensitivity_idx]
+                fixed_sensitivity.append(spec)
+
+                specificity_idx = np.argmin(np.abs(specificity-value))
+                sen = sensitivity[specificity_idx]
+                fixed_specificity.append(sen)
+
+            return auc, np.array(fixed_sensitivity), np.array(fixed_specificity)
@@ -0,0 +1,57 @@
+import os
+import numpy as np
+from tqdm import tqdm
+
+from mapping import sider_eval_pairs, drug2id, adr2id, drug_list, adr_list
+class FAERSdata:
+    def __init__(self, directory, method, year):
+
+        Files = os.listdir('%s/%s' % (directory, method))
+
+        if year == 'all':
+            Files = [Files[-1]]
+
+        X = {}
+        Y = {}
+        Index = {}
+        for i in tqdm(range(len(Files))):
+            f = Files[i]
+            x = np.zeros(shape=(len(drug_list), len(adr_list)))
+            with open('%s/%s/%s' % (directory, method, f), 'r') as ff:
+                next(ff)
+                for line in ff:
+                    line = line.strip('\n')
+                    line = line.split(',')
+                    drug, adr, score = line[0], line[1], round(float(line[2]),5)
+                    drug_id, adr_id = drug2id.get(drug), adr2id.get(adr)
+                    if drug in drug_list and adr in adr_list:
+                        x[drug_id, adr_id] = score
+
+            y = np.zeros(shape=(len(drug_list), len(adr_list)))
+            for drug, adr in sider_eval_pairs:
+                drug_id, adr_id = drug2id.get(drug), adr2id.get(adr)
+                y[drug_id, adr_id] = 1
+
+            y = np.asarray(y)
+            index = np.arange(x.shape[0])
+
+            X[i] = x
+            Y[i] = y
+            Index[i] = index.tolist()
+
+        self.X = X
+        self.Y = Y
+        self.Index = Index
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -0,0 +1,75 @@
+import numpy as np
+import pickle
+from collections import defaultdict
+
+from Eval import Eval
+from mapping import drugid2rxnorm, rxnorm2features, id2drug, id2adr
+from utils import split_data
+from similarity import get_Jaccard_Similarity
+
+
+class Model:
+    def __init__(self, metrics):
+        self.ALPHA = 0.1
+        self.metrics = metrics
+
+    def get_similarity_matrix(self, X):
+        features_matrix = []
+        for idx in range(X.shape[0]):
+            drug = id2drug.get(idx)
+            rxnorm = drugid2rxnorm[drug]
+            features = rxnorm2features[rxnorm]
+            features_matrix.append(features)
+        features_matrix = np.asarray(features_matrix)
+        return get_Jaccard_Similarity(features_matrix)
+
+    def label_propogation(self, X, alpha):
+        similarity_matrix = self.get_similarity_matrix(X)
+        score_matrix_drug = (1 - alpha) * np.matmul(np.linalg.pinv(
+            np.eye(np.shape(X)[0]) - alpha * similarity_matrix), X)
+        return score_matrix_drug
+
+    def validate(self, X, Y, idx):
+        AUC = []
+        for i in range(1, 10):
+            alpha = i * 0.1
+            Y_pred = self.predict(X, alpha)
+            metrics = self.eval(Y_pred, Y, idx)
+            auc = metrics[0]
+            AUC.append(auc)
+        print(AUC)
+        max_auc = max(AUC)
+        max_idx = AUC.index(max_auc)
+        max_alpha = (max_idx + 1) * 0.1
+        self.ALPHA = max_alpha
+
+    def predict(self, X, alpha):
+        Y_pred = self.label_propogation(X, alpha)
+        return Y_pred
+
+    def eval(self, Y_pred, Y, idx):
+        y_pred, y_gold = [], []
+        for r, c in zip(idx[0], idx[1]):
+            y_pred.append(Y_pred[r, c])
+            y_gold.append(Y[r, c])
+        ev = Eval(y_pred, y_gold)
+        return ev.Metrics(self.metrics)
+
+
+    def eval_DME(self, Y_pred, Y, idx, DME):
+        y_pred, y_gold = defaultdict(list), defaultdict(list)
+        for r, c in zip(idx[0], idx[1]):
+            adrid = id2adr.get(c)
+            if adrid in DME:
+                y_pred[adrid].append(Y_pred[r, c])
+                y_gold[adrid].append(Y[r, c])
+        EV = {}
+        for k in y_pred.keys():
+            y_p, y_g = y_pred.get(k), y_gold.get(k)
+            ev = Eval(y_p, y_g)
+            EV[k] = ev.Metrics(self.metrics)
+        return EV
+
+
+
+
@@ -0,0 +1,35 @@
+# LP-SDA
+
+## 1. Introduction
+This repository contains the source code for the paper ["Towards early detection of adverse drug reactions: combining pre-clinical drug structures and post-market safety reports"]() (accepted by **_BMC Medical Informatics and Decision Making_**).
+In this paper, we propose a label propagation framework to enhance drug safety signals by combining pre-clinical drug chemical structures with post-marketing safety reports from [FDA Adverse Event Reporting System (FAERS)](https://open.fda.gov/data/faers/). 
+
+We apply the label propagation framework to four popular signal detection algorithms (PRR, ROR, MGPS, BCPNN) and find that our proposed framework generates more accurate drug safety signals than the corresponding baselines.
+
+## 2. Pipeline
+![alt text](img/pipeline.jpg "Pipeline")
+
+Fig. 1: The overall framework for label propagation based signal detection algorithms. It consists of three main steps: computing original drug safety signals from FAERS reports, constructing a drug-drug similarity network from pre-clinical drug structures, and generating enhanced drug safety signals through a label propagation process.
+
+## 3. Dataset
+Datasets used in the paper:
+- [FAERS](https://open.fda.gov/data/faers/): a database that contains information on adverse event and medication error reports submitted to FDA. We use a curated and standardized version of FAERS data from 2004 to 2014 (Banda, Juan M. et al., 2017) [[paper&data]](https://datadryad.org/stash/dataset/doi:10.5061/dryad.8q0s4).
+- [PubChem](https://www.ncbi.nlm.nih.gov/pubmed/26400175): a public repository for information on chemical substances and their biological activities. The PubChem Compound database provides unique chemical structure information of drugs.
+- [SIDER](http://sideeffects.embl.de/): a database that contains information on marketed medicines and their recorded adverse drug reactions. 
+
+## 4. Code
+#### Running example
+```
+python run.py --input SignalScoresSource --method PRR05 --year all --eval_metrics all --split True
+```
+
+#### Parameters
+- `--input`, directory containing the original signal score files (one sub-directory per method).
+- `--method`, signal detection algorithm; one of `PRR05`, `ROR05`, `GPS`, `BCPNN`.
+- `--year`, years of data used for the model: `all` (all years of data from 2004 to 2014) or `each` (data arranged by ending year).
+- `--eval_metrics`, evaluation metrics: `all` (AUC, AUPR, Precision, Recall, Accuracy, F1) or `specificity-sensitivity`.
+- `--split`, whether to split the entire dataset into a validation set and a testing set.
+- `--output`, output file.
+
+## 5. Citation
+Please kindly cite the paper if you use the code, datasets or any results in this repo or in the paper:
@@ -0,0 +1,14 @@
+import pickle
+
+sider_eval_pairs = pickle.load(open('pickles/sider_eval_pairs_final.pkl', 'rb'))
+drugid2rxnorm = pickle.load(open('pickles/drugid2rxnorm_mapping.pkl', 'rb'))
+rxnorm2features = pickle.load(open('pickles/rxnorm2features_mapping.pkl', 'rb'))
+
+drug_list = list(set(drug for (drug, adr) in sider_eval_pairs))
+adr_list = list(set(adr for (drug, adr) in sider_eval_pairs))
+
+id2drug = {i: drug for i, drug in enumerate(drug_list)}
+drug2id = {drug: i for i, drug in enumerate(drug_list)}
+
+id2adr = {i: adr for i, adr in enumerate(adr_list)}
+adr2id = {adr: i    for i, adr in enumerate(adr_list)}
@@ -0,0 +1,115 @@
+from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
+import numpy as np
+
+from FAERSdata import FAERSdata
+from Model import Model
+from utils import split_data, sample_zeros
+
+
+def parse_args():
+    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter, conflict_handler='resolve')
+    parser.add_argument('--input', required=True, help='Input original signal scores file.')
+    parser.add_argument('--method', required=True, choices=['PRR05', 'ROR05', 'GPS', 'BCPNN'], help='Signal detection algorithm')
+    parser.add_argument('--year', default='all', choices=['all', 'each'], help='Years of data used for model')
+    parser.add_argument('--eval_metrics', required=True, choices=['all', 'specificity-sensitivity'],
+                        help='Evaluation metrics')
+    parser.add_argument('--split', type=bool, default=False)
+    parser.add_argument('--output')
+
+    args = parser.parse_args()
+    return args
+
+
+def pretty_print_eval(res, metrics):
+    if metrics == 'all':
+        print('All metrics: ' + ','.join(np.round(res,3).astype(str)))
+    else:
+        print('fixed_sensitivity: ' + ','.join(np.round(res[1],3).astype(str)))
+        print('fixed_specificity: ' + ','.join(np.round(res[2],3).astype(str)))
+
+
+def main(args):
+    print('#' * 50)
+    print('Signal Detection Algorithm: {}, Year: {}'.format(args.method, args.year))
+    print('#' * 50)
+
+
+    data = FAERSdata(args.input, args.method, args.year)
+
+    for i in range(len(data.X.keys())):
+        X, Y, _ = data.X.get(i), data.Y.get(i), data.Index.get(i)
+        # all_idx = np.where(Y > -1)
+        all_idx = sample_zeros(Y)
+        if args.split:
+            valid, test = split_data(Y)
+            model = Model(args.eval_metrics)
+            model.validate(X, Y, valid)
+            Y_pred = model.predict(X, model.ALPHA)
+            valid_res = model.eval(Y_pred, Y, valid)
+            test_res = model.eval(Y_pred, Y, test)
+            print('LP-{}:'.format(args.method))
+            print('alpha: {}'.format(model.ALPHA))
+            print('valid:')
+            pretty_print_eval(valid_res, args.eval_metrics)
+            print('test:')
+            pretty_print_eval(test_res, args.eval_metrics)
+
+            valid_res = model.eval(X, Y, valid)
+            test_res = model.eval(X, Y, test)
+            print('baseline-{}:'.format(args.method))
+            print('valid:')
+            pretty_print_eval(valid_res, args.eval_metrics)
+            print('test:')
+            pretty_print_eval(test_res, args.eval_metrics)
+        else:
+            model = Model(args.eval_metrics)
+            model.validate(X, Y, all_idx)
+            Y_pred = model.predict(X, model.ALPHA)
+            res = model.eval(Y_pred, Y, all_idx)
+            print('LP-{}:'.format(args.method))
+            pretty_print_eval(res, args.eval_metrics)
+
+            print('baseline-{}:'.format(args.method))
+            res = model.eval(X, Y, all_idx)
+            pretty_print_eval(res, args.eval_metrics)
+
+def main_DME(args):
+    print('#' * 50)
+    print('Signal Detection Algorithm: {}, Year: {}'.format(args.method, args.year))
+    print('#' * 50)
+
+    data = FAERSdata(args.input, args.method, args.year)
+    DME = np.loadtxt('DME.txt', dtype=str, delimiter=',')
+    adr_id, adr_name = DME[:,0], DME[:,1]
+
+    out = open(args.output, 'w')
+    # out.write('ID,Name,AUC,AUC,AUPR,AUPR,Precision,Precision,Recall,Recall,Accuracy,Accuracy,F1,F1\n')
+    for i in range(len(data.X.keys())):
+        X, Y, _ = data.X.get(i), data.Y.get(i), data.Index.get(i)
+        # all_idx = np.where(Y > -1)
+        eval_idx = sample_zeros(Y)
+        model = Model(args.eval_metrics)
+        Y_pred = model.predict(X, model.ALPHA)
+        LP_res = model.eval_DME(Y_pred, Y, eval_idx, adr_id)
+        baseline_res = model.eval_DME(X, Y, eval_idx, adr_id)
+        for i, adr in enumerate(list(adr_id)):
+            print('LP-{}:'.format(args.method))
+            LP_metric = LP_res.get(adr)
+            print('ADR:{} '.format(adr))
+            pretty_print_eval(LP_metric, args.eval_metrics)
+
+            print('baseline-{}:'.format(args.method))
+            baseline_metric = baseline_res.get(adr)
+            pretty_print_eval(baseline_metric, args.eval_metrics)
+
+            out.write('{},{},{},{}\n'.format(adr, adr_name[i], ','.join(np.round(LP_metric,3).astype(str)), ','.join(np.round(baseline_metric,3).astype(str))))
+
+    out.close()
+
+
+def more_main():
+    args = parse_args()
+    main(args)
+
+if __name__ == '__main__':
+    more_main()