---
# NEAT run configuration for KG-OntoML: run metadata, upload target,
# and the graph inputs used for classifier training.
name: 'kg-ontoml'
description: 'KG-OntoML'
output_directory: graph_ml

# Upload settings for the contents of output_directory.
upload:
  s3_bucket: kg-hub-public-data
  s3_bucket_dir: kg-ontoml/20220429/graph_ml/
  extra_args:
    ACL: 'public-read'

# We already have everything necessary to train
# classifiers, but for now NEAT complains (KeyError)
# if a graph_data block isn't present.
graph_data:
  graph:
    directed: false
    # This file contains the graph edges/nodes AND the pos/neg
    # train/valid graphs below, as well as the graph embeddings.
    # Original graph is at
    # https://kg-hub.berkeleybop.io/kg-ontoml/20220304/KG-OntoML.tar.gz
    graph_path: https://kg-hub.berkeleybop.io/frozen_incoming_data/kg-ontoml-bundle-prefixcats.tar.gz
    node_path: merged-kg_nodes-prefixcats.tsv
    edge_path: merged-kg_edges.tsv
    verbose: true
    nodes_column: 'id'
    node_list_node_types_column: 'category'
    default_node_type: 'biolink:NamedThing'
    sources_column: 'subject'
    destinations_column: 'object'
    default_edge_type: 'biolink:related_to'
  pos_validation:
    edge_path: upheno_mapping_all_edges.tsv
  neg_training:
    edge_path: negative_edges.tsv
  neg_validation:
    edge_path: negative_valid_edges.tsv

# For now, this embedding is static, and packaged with the file
# in graph_path above.
# In practice, each run will require generation of new embeddings.
embeddings:
  embedding_file_name: KG-OntoML-SkipGram.tsv
  embedding_history_file_name: embedding_history.json
  node_embedding_params:
    node_embedding_method_name: SkipGram
    use_mirrored_strategy: false
    walk_length: 100
    batch_size: 128
    window_size: 4
    return_weight: 1.0
    explore_weight: 1.0
    iterations: 20

# Four classifiers, all using the Average edge method: one Keras
# Sequential network and three scikit-learn models.
classifiers:
  - classifier_id: mlp_0
    type: neural network
    edge_method: Average
    model:
      outfile: 'model_mlp_kg-ontoml.model'
      classifier_history_file_name: 'model_mlp_kg-ontoml_history.json'
      type: tensorflow.keras.models.Sequential
      layers:
        - type: tensorflow.keras.layers.Input
          parameters:
            # Must match the embedding size. NOTE(review): no
            # embedding_size key is set under node_embedding_params
            # above, so this presumably relies on the embedding
            # method's default - TODO confirm it is 100.
            shape: 100
        - type: tensorflow.keras.layers.Dense
          parameters:
            units: 128
            activation: relu
        - type: tensorflow.keras.layers.Dense
          parameters:
            units: 32
            activation: relu
            # TODO: fix this:
            # activity_regularizer: tensorflow.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)
        - type: tensorflow.keras.layers.Dropout
          parameters:
            rate: 0.5
        - type: tensorflow.keras.layers.Dense
          parameters:
            units: 16
            activation: relu
        - type: tensorflow.keras.layers.Dense
          parameters:
            units: 1
            activation: sigmoid
    model_compile:
      loss: binary_crossentropy
      optimizer: nadam
      metrics:
        - type: tensorflow.keras.metrics.AUC
          parameters:
            curve: PR
            name: auprc
        - type: tensorflow.keras.metrics.AUC
          parameters:
            curve: ROC
            name: auroc
        - type: tensorflow.keras.metrics.Recall
          parameters:
            name: Recall
        - type: tensorflow.keras.metrics.Precision
          parameters:
            name: Precision
        - type: accuracy
    model_fit:
      parameters:
        batch_size: 4096
        epochs: 10
        callbacks:
          - type: tensorflow.keras.callbacks.EarlyStopping
            parameters:
              monitor: val_loss
              patience: 5
              min_delta: 0.001  # min improvement to be considered progress
          - type: tensorflow.keras.callbacks.ReduceLROnPlateau

  - classifier_id: dt_0
    type: Decision Tree
    edge_method: Average
    model:
      outfile: 'model_decisiontree_kg-ontoml.model'
      type: sklearn.tree.DecisionTreeClassifier
      parameters:
        max_depth: 30
        random_state: 42

  - classifier_id: rf_0
    type: Random Forest
    edge_method: Average
    model:
      outfile: 'model_randomforest_kg-ontoml.model'
      type: sklearn.ensemble.RandomForestClassifier
      parameters:
        n_estimators: 500
        max_depth: 30
        n_jobs: 8
        random_state: 42

  - classifier_id: lr_0
    type: Logistic Regression
    edge_method: Average
    model:
      outfile: 'model_lr_kg-ontoml.model'
      type: sklearn.linear_model.LogisticRegression
      parameters:
        random_state: 42
        max_iter: 2000

# One entry per classifier_id defined under classifiers; every entry
# uses the same cutoff and the same HP -> MP node-type pairing.
apply_trained_classifier:
  - classifier_model_id: mlp_0
    cutoff: 0.9
    outfile: mlp_classifier_predictions_kgx.tsv
    link_node_types:
      source:
        - 'HP'
      destination:
        - 'MP'
  - classifier_model_id: dt_0
    cutoff: 0.9
    outfile: dt_classifier_predictions_kgx.tsv
    link_node_types:
      source:
        - 'HP'
      destination:
        - 'MP'
  - classifier_model_id: rf_0
    cutoff: 0.9
    outfile: rf_classifier_predictions_kgx.tsv
    link_node_types:
      source:
        - 'HP'
      destination:
        - 'MP'
  - classifier_model_id: lr_0
    cutoff: 0.9
    outfile: lr_classifier_predictions_kgx.tsv
    link_node_types:
      source:
        - 'HP'
      destination:
        - 'MP'