---
# Graph-ML pipeline configuration for the KG-IDG knowledge graph.
# Sections: output target, S3 upload, input graph data, node embeddings,
# link-prediction classifiers, and application of the trained models.
#
# name: "kg-idg"
# description: "KG-IDG"

Target:
  target_path: graph_ml

Upload:
  s3_bucket: kg-hub-public-data
  s3_bucket_dir: kg-idg/20230101/graph_ml/
  extra_args:
    "ACL": "public-read"

GraphDataConfiguration:
  source_data:
    files:
      # NOTE(review): the URL contains the literal placeholder
      # "THIS-BUILD-ID", while s3_bucket_dir above names build 20230101.
      # Presumably the placeholder is substituted before use - confirm.
      - path: https://kg-hub.berkeleybop.io/kg-idg/THIS-BUILD-ID/KG-IDG.tar.gz
        desc: "Location of KG-IDG nodefile, edgefile, and validation subgraphs."
  graph:
    directed: false
    node_path: merged-kg_nodes.tsv
    edge_path: merged-kg_edges.tsv
    verbose: true
    # Column names in the node and edge files, plus fallback types.
    nodes_column: 'id'
    node_list_node_types_column: 'category'
    default_node_type: 'biolink:NamedThing'
    sources_column: 'subject'
    destinations_column: 'object'
    default_edge_type: 'biolink:related_to'
  evaluation_data:
    valid_data:
      pos_edge_filepath: pos_valid_edges.tsv
      neg_edge_filepath: neg_valid_edges.tsv
    # Only a negative edge file is listed for training.
    train_data:
      neg_edge_filepath: neg_train_edges.tsv

EmbeddingsConfig:
  filename: KG-IDG-SkipGram.tsv
  history_filename: embedding_history.json
  node_embedding_params:
    node_embedding_method_name: SkipGram
    use_mirrored_strategy: false
    walk_length: 100
    batch_size: 128
    window_size: 4
    return_weight: 1.0
    explore_weight: 1.0
    iterations: 20
  tsne_file_name: tsne.png

ClassifierContainer:
  classifiers:
    # Keras multilayer perceptron.
    - classifier_id: mlp_0
      classifier_name: neural network
      classifier_type: tensorflow.keras.models.Sequential
      edge_method: Average
      outfile: "model_mlp_kg-idg.model"
      history_filename: "model_mlp_kg-idg_history.json"
      parameters:
        tf_keras_params:
          layers_config:
            layers:
              - type: tensorflow.keras.layers.Input
                parameters:
                  # Must match the dimensionality of the node embeddings.
                  # NOTE(review): no embedding_size is set under
                  # EmbeddingsConfig, so this presumably relies on the
                  # embedding method's default - confirm it is 100.
                  shape: 100
              - type: tensorflow.keras.layers.Dense
                parameters:
                  units: 128
                  activation: relu
              - type: tensorflow.keras.layers.Dense
                parameters:
                  units: 32
                  activation: relu
              - type: tensorflow.keras.layers.Dropout
                parameters:
                  rate: 0.5
              - type: tensorflow.keras.layers.Dense
                parameters:
                  units: 16
                  activation: relu
              # Single sigmoid unit, paired with binary_crossentropy below.
              - type: tensorflow.keras.layers.Dense
                parameters:
                  units: 1
                  activation: sigmoid
          # NOTE(review): loss, optimizer, metrics_config and fit_config are
          # placed as siblings of layers_config - confirm against the
          # consuming tool's schema.
          loss: binary_crossentropy
          optimizer: nadam
          metrics_config:
            metrics:
              - name: auprc
                type: tensorflow.keras.metrics.AUC
                curve: PR
              - name: auroc
                type: tensorflow.keras.metrics.AUC
                curve: ROC
              - name: Recall
                type: tensorflow.keras.metrics.Recall
              - name: Precision
                type: tensorflow.keras.metrics.Precision
              - type: accuracy
          fit_config:
            batch_size: 4096
            epochs: 10
            callbacks_list:
              callbacks:
                - type: tensorflow.keras.callbacks.EarlyStopping
                  monitor: val_loss
                  patience: 5
                  min_delta: 0.001
                - type: tensorflow.keras.callbacks.ReduceLROnPlateau

    # scikit-learn random forest.
    - classifier_id: rf_0
      classifier_name: Random Forest
      classifier_type: sklearn.ensemble.RandomForestClassifier
      edge_method: Average
      outfile: "model_randomforest_kg-idg.model"
      parameters:
        sklearn_params:
          n_estimators: 500
          max_depth: 30
          n_jobs: 8
          random_state: 42

    # scikit-learn logistic regression.
    - classifier_id: lr_0
      classifier_name: Logistic Regression
      classifier_type: sklearn.linear_model.LogisticRegression
      edge_method: Average
      outfile: "model_lr_kg-idg.model"
      parameters:
        sklearn_params:
          random_state: 42
          max_iter: 2000

# Each model_id below matches a classifier_id defined in ClassifierContainer.
# All three entries share the same cutoff and the same source/destination
# node types; only the output file differs.
ApplyTrainedModelsContainer:
  models:
    - model_id: mlp_0
      cutoff: 0.8
      outfile: mlp_classifier_predictions_kgx.tsv
      node_types:
        source:
          - 'biolink:Drug'
          - 'biolink:Protein'
        destination:
          - 'biolink:Drug'
          - 'biolink:Protein'
    - model_id: rf_0
      cutoff: 0.8
      outfile: rf_classifier_predictions_kgx.tsv
      node_types:
        source:
          - 'biolink:Drug'
          - 'biolink:Protein'
        destination:
          - 'biolink:Drug'
          - 'biolink:Protein'
    - model_id: lr_0
      cutoff: 0.8
      outfile: lr_classifier_predictions_kgx.tsv
      node_types:
        source:
          - 'biolink:Drug'
          - 'biolink:Protein'
        destination:
          - 'biolink:Drug'
          - 'biolink:Protein'