---
# Graph machine-learning pipeline configuration for kg-phenio:
# graph inputs, node embeddings, classifier training, and application of
# the trained models.

# name: "kg-phenio"
# description: "kg-phenio"

Target:
  target_path: graph_ml

# The release date (20241104) in s3_bucket_dir also appears in the
# source_data path below; update both together.
Upload:
  s3_bucket: kg-hub-public-data
  s3_bucket_dir: kg-phenio/20241104/graph_ml/
  extra_args:
    ACL: public-read

GraphDataConfiguration:
  source_data:
    files:
      - path: https://kg-hub.berkeleybop.io/kg-phenio/20241104/kg-phenio.tar.gz
        desc: >-
          Location of KG-Phenio nodefile, edgefile, prefixcats nodes,
          and validation subgraphs.
  graph:
    directed: false
    node_path: merged-kg_nodes-prefixcats.tsv
    edge_path: merged-kg_edges.tsv
    verbose: true
    nodes_column: 'id'
    node_list_node_types_column: 'category'
    default_node_type: 'biolink:NamedThing'
    sources_column: 'subject'
    destinations_column: 'object'
    default_edge_type: 'biolink:related_to'
  evaluation_data:
    valid_data:
      pos_edge_filepath: pos_valid_edges.tsv
      neg_edge_filepath: neg_valid_edges.tsv
    # Only a negative edge file is listed for training.
    train_data:
      neg_edge_filepath: neg_train_edges.tsv

# Generate new embeddings.
# These settings are reduced from the usual values while some bugs are
# sorted out; the usual value is noted next to each reduced one.
EmbeddingsConfig:
  filename: kg-phenio-SkipGram-minimal.tsv
  history_filename: embedding_history.json
  node_embedding_params:
    node_embedding_method_name: SkipGram
    use_mirrored_strategy: false
    walk_length: 20  # usually 100
    batch_size: 64  # usually 128
    window_size: 4
    return_weight: 1.0
    explore_weight: 1.0
    iterations: 5  # usually 20
  # NOTE(review): spelled differently from the sibling keys `filename` and
  # `history_filename`; confirm the consumer expects `tsne_file_name`.
  tsne_file_name: tsne.png

ClassifierContainer:
  classifiers:
    - classifier_id: mlp_0
      classifier_name: neural network
      classifier_type: tensorflow.keras.models.Sequential
      edge_method: Average
      outfile: 'model_mlp_kg-phenio.model'
      history_filename: 'model_mlp_kg-phenio_history.json'
      parameters:
        tf_keras_params:
          layers_config:
            layers:
              - type: tensorflow.keras.layers.Input
                parameters:
                  # Must match the embedding size. No embedding_size is
                  # set in node_embedding_params above, so this presumably
                  # relies on the embedding method's default; TODO confirm.
                  shape: 100
              - type: tensorflow.keras.layers.Dense
                parameters:
                  units: 128
                  activation: relu
              - type: tensorflow.keras.layers.Dense
                parameters:
                  units: 32
                  activation: relu
              - type: tensorflow.keras.layers.Dropout
                parameters:
                  rate: 0.5
              - type: tensorflow.keras.layers.Dense
                parameters:
                  units: 16
                  activation: relu
              # Single sigmoid unit, paired with binary_crossentropy below.
              - type: tensorflow.keras.layers.Dense
                parameters:
                  units: 1
                  activation: sigmoid
          loss: binary_crossentropy
          optimizer: nadam
          metrics_config:
            metrics:
              - name: auprc
                type: tensorflow.keras.metrics.AUC
                curve: PR
              - name: auroc
                type: tensorflow.keras.metrics.AUC
                curve: ROC
              - name: Recall
                type: tensorflow.keras.metrics.Recall
              - name: Precision
                type: tensorflow.keras.metrics.Precision
              - type: accuracy
          fit_config:
            batch_size: 4096
            epochs: 10
            callbacks_list:
              callbacks:
                - type: tensorflow.keras.callbacks.EarlyStopping
                  monitor: val_loss
                  patience: 5
                  min_delta: 0.001
                - type: tensorflow.keras.callbacks.ReduceLROnPlateau

    # Disabled decision-tree classifier.
    # NOTE(review): uses key names that differ from the active entries
    # (`type`/`model` vs `classifier_name`/`classifier_type`) and a
    # kg-ontoml outfile; update before re-enabling.
    # - classifier_id: dt_0
    #   type: Decision Tree
    #   edge_method: Average
    #   model:
    #     outfile: "model_decisiontree_kg-ontoml.model"
    #     type: sklearn.tree.DecisionTreeClassifier
    #     parameters:
    #       max_depth: 30
    #       random_state: 42

    - classifier_id: rf_0
      classifier_name: Random Forest
      classifier_type: sklearn.ensemble.RandomForestClassifier
      edge_method: Average
      outfile: 'model_randomforest_kg-phenio.model'
      parameters:
        sklearn_params:
          n_estimators: 500
          max_depth: 30
          n_jobs: 8
          random_state: 42

    - classifier_id: lr_0
      classifier_name: Logistic Regression
      classifier_type: sklearn.linear_model.LogisticRegression
      edge_method: Average
      outfile: 'model_lr_kg-phenio.model'
      parameters:
        sklearn_params:
          random_state: 42
          max_iter: 2000

# Each model_id below refers to a classifier_id defined in
# ClassifierContainer above.
ApplyTrainedModelsContainer:
  models:
    - model_id: mlp_0
      cutoff: 0.9
      outfile: mlp_classifier_predictions_kgx.tsv
      node_types:
        source:
          - 'HP'
        destination:
          - 'MP'

    # Disabled entry for the decision-tree model.
    # NOTE(review): uses `classifier_model_id`/`link_node_types` rather than
    # the `model_id`/`node_types` keys of the active entries.
    # -
    #   classifier_model_id: dt_0
    #   cutoff: 0.9
    #   outfile: dt_classifier_predictions_kgx.tsv
    #   link_node_types:
    #     source:
    #       - 'HP'
    #     destination:
    #       - 'MP'

    - model_id: rf_0
      cutoff: 0.9
      outfile: rf_classifier_predictions_kgx.tsv
      node_types:
        source:
          - 'HP'
        destination:
          - 'MP'

    - model_id: lr_0
      cutoff: 0.9
      outfile: lr_classifier_predictions_kgx.tsv
      node_types:
        source:
          - 'HP'
        destination:
          - 'MP'