# pVOGs functions interactions

## TL;DR
```
# Clone this repo
$ git clone https://git.science.uu.nl/n.pappas/pvogs_function.git
# Get in there
$ cd pvogs_function
# Optional, if snakemake>=5.14 and conda available
$ conda env create -n my_env --file=environment.yml
$ conda activate my_env
# Dry run to check that it works
(my_env)$ snakemake --use-conda -n
```
## Description

The main purpose of this repository is to host the code necessary for full reproducibility.

- The required raw data are hosted on the zenodo sandbox. They are downloaded automatically when the workflow is executed, so there is no need to fetch them manually.

Most of the steps are not easily configurable, unless you take a dive into all the rules and scripts. This is by choice.
## Requirements

- A working conda installation
- `snakemake >= 5.14` (any version with jupyter integration should do)
- Optional: `mamba == 0.5.1` (speeds up environment dependency resolution and creation; see the sketch after this list)
- Optional: You can create the same conda environment used during development with the provided `environment.yml`:

```
$ conda env create -n my_env --file=./environment.yml
```

Make sure you activate it before you launch snakemake:

```
$ conda activate my_env
(my_env)$ snakemake --version
5.23.0
```
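
If you also want `mamba` in that environment, a minimal sketch, assuming you install it from the conda-forge channel (the channel choice is an assumption, not something this repo pins):

```
# Add mamba to the environment created above
$ conda install -n my_env -c conda-forge mamba=0.5.1
```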
## Configuration

The configuration options are included in `config/config.yml`. These include:

- `negatives`: Specifies the number of negative datasets to create. 10 is used in the manuscript. **Changing this will most likely break the workflow.**
- The zenodo DOIs: Until the workflow gets published, I am using the zenodo sandbox for testing.
- `threads` per rule: For the most resource-demanding rules included in the config, you can specify the number of cores each rule will utilize at runtime. I have set these to reasonable values for my own local setup (Ubuntu 16.04.1 x86_64 with 120 GB of RAM and 20 processors) for a good parallelization/runtime balance. You should adjust these according to your own local setup.
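
For orientation only, a hypothetical sketch of what `config/config.yml` might contain; the key names and values below are illustrative assumptions, so check the actual file before changing anything:

```yaml
# Illustrative sketch -- the real keys live in config/config.yml
negatives: 10                         # number of negative datasets (manuscript value; changing it will likely break the workflow)
zenodo_doi: "10.5072/zenodo.XXXXXX"   # sandbox DOI placeholder
threads:
  hmmsearch: 20                       # cores for the most resource-demanding rules
  comparem: 20
```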
## Usage
Currently, this workflow was built and tested on a local machine with an X server available (i.e. you can do stuff in a GUI).
If you run this on a remote machine, make sure that you (can) ssh with `ssh -X ...`. This is required for the `summarize_intact.py` script, which uses the `ete3` package to do some plotting.
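
For example, with a placeholder hostname, you can check that X forwarding is active before launching anything:

```
# user@remote.host is a placeholder for your own remote machine
$ ssh -X user@remote.host
# On the remote machine, a non-empty DISPLAY (e.g. localhost:10.0) means X forwarding works
$ echo $DISPLAY
```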
### Option 1. This repo

`cd` into the root directory of this repo.
- Dry run

Always a good idea before launching the whole workflow:

```
$ snakemake --use-conda -j16 -np
```

If the dry run completes with no errors you can execute the workflow by removing the `-n` flag.

- Adjust the number of parallel jobs (`-j`) according to your setup.
- Remove the `-p` flag if you don't want the commands to be printed.

```
$ snakemake --use-conda -j16 -p
```
- Speed up environment creation with mamba

If `mamba` is available in your snakemake environment, or if you created a new environment with the `environment.yml` provided here:

```
$ snakemake --use-conda -j16 --conda-frontend mamba
```
- Jupyter integration

A central notebook is used for all visualization and machine learning (model search) purposes. Its main output is the `results/RF/best_model.pkl` file. If you want to fiddle around with it yourself:

```
$ snakemake --use-conda -j16 --conda-frontend mamba --edit-notebook results/RF/best_model.pkl
```

Once `results/RF/best_model.pkl` is written, you can save the changes and quit the server (more info here, and you can always see this demo). This will trigger the execution of the rest of the workflow. The resulting notebook will be saved as `results/logs/processed_notebook.py.ipynb`.

Note that, depending on the changes you make, the results you get may differ from those of the default, non-interactive run.
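
If you would rather skip the interactive session, plain Snakemake target syntax (not a repo-specific command) lets you request just that file and everything it depends on:

```
# Build only results/RF/best_model.pkl and its upstream dependencies
$ snakemake --use-conda -j16 results/RF/best_model.pkl
```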
### Option 2. Archived workflow from zenodo (TO DO)

Something along the guidelines from snakemake.
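
Until that lands, a minimal sketch of what this will probably look like, assuming the published workflow is a tarball (the archive name below is a placeholder):

```
# Download the archive from zenodo, then:
$ tar -xzf pvogs_function.tar.gz
$ cd pvogs_function
# Proceed as in Option 1, starting with a dry run
$ snakemake --use-conda -n
```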
## Output

The output of the whole workflow is produced and stored within a `results` directory. This has the structure shown below (several directories and files have been omitted). The most prominent entries are marked with a short description:

```
# Skipping several thousands of intermediate files with the -I option
$ tree -n -I '*NC*.fasta|*_genes.*|*.gff|*.log' results
results
├── annotations.tsv
├── filtered_scores.tsv -------------------- * Table containing feature values for all interactions passing filtering
├── final_training_set.tsv
├── interaction_datasets
│ ├── 01_filter_intact
│ ├── 02_summarize_intact
│ ├── 03_uniprot
│ ├── 04_process_uniprot
│ ├── 05_genomes
│ ├── 06_map_proteins_to_pvogs
│ ├── N1 ---------------------------------- * Features, interactions, proteins, and pvogs are stored per dataset
....
│ └── positives --------------------------
│ ├── positives.features.tsv
│ ├── positives.interactions.tsv
│ ├── positives.proteins.faa
│ └── positives.pvogs_interactions.tsv
├── logs
├── predictions.tsv ------------------------- * Final predictions made
├── pre_process
│ ├── all_genomes
│ ├── comparem --------------------------- * Directory with the final AAI matrix used
...
│ ├── fastani ---------------------------- * Directory with the final ANI matrix used
│ ├── hmmsearch -------------------------- * HMMER search results for all pvogs profiles against the translated genomes
│ ├── reflist.txt
│ └── transeq
│ └── transeq.genomes.fasta
├── RF
│ ├── best_model_id.txt ------------------- * Contains the id of the negative dataset that yielded the best model
│ ├── best_model.pkl ---------------------- * The best model obtained.
│ ├── features_stats.tsv ------------------ * Mean, max, min, std for feature importances
│ ├── features.tsv ------------------------ * Exact values of features importances for each combination of training/validation
│ ├── figures ----------------------------- * Figures used in the manuscript.
│ │ ├── Figure_1a.svg
....
....
│ ├── metrics.pkl
│ ├── metrics.stats.tsv ------------------- * Mean, max, min, std across all models
│ ├── metrics.tsv ------------------------- * Exact values of metrics for each combination of training/validation
│ └── models
│ ├── N10.RF.pkl ---------------------- * Best model obtained when optimizing with each negative set
.....
.....
└── scores.tsv ----------------------------- * Master table with feature values for all possible pVOGs combinations
```
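
To inspect the trained model afterwards, a minimal sketch, assuming `best_model.pkl` is a pickled scikit-learn estimator (the RF naming suggests so, but verify in the notebook) and that you run it inside the analysis environment:

```
# Print the estimator and its hyperparameters
$ python -c 'import pickle; print(pickle.load(open("results/RF/best_model.pkl", "rb")))'
```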