From 776827fba3f80a3127d5d10cd21786b4856b560e Mon Sep 17 00:00:00 2001 From: papanikos <n.pappas@uu.nl> Date: Thu, 14 Jan 2021 21:05:52 +0100 Subject: [PATCH] add size filtering rule --- workflow/Snakefile | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 415e26c..ff19b64 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -51,6 +51,7 @@ def collect_prediction_tsvs(wc): rule all: input: expand([ + "results/{sample}/tmp/filtered.fa.gz", "results/{sample}/tmp/reflist.txt", "results/{sample}/{tool}/predictions.tsv", "results/{sample}/all_predictions.tsv", @@ -61,9 +62,24 @@ rule all: ########### ## RULES ## ########### -rule split_multifasta: + +rule size_filter: input: multifasta_fp = get_sample_fasta + output: + filtered_fasta = "results/{sample}/tmp/filtered.fa.gz" + threads: 4 + log: + "logs/{sample}/size_filter.log" + params: + min_size = 5000 + shell: + "seqkit seq -g -j {threads} -m {params.min_size} " + "{input.multifasta_fp} | gzip -c >{output.filtered_fasta} 2>{log}" + +rule split_multifasta: + input: + multifasta_fp = rules.size_filter.output.filtered_fasta output: reflist = "results/{sample}/tmp/reflist.txt" log: "logs/{sample}/split_multifasta.log" @@ -109,8 +125,6 @@ rule filter_vhulk: # RAFAH rule run_rafah: input: - # Run vhulk first since it writing in the input - rules.run_vhulk.output.done_txt, reflist = rules.split_multifasta.output.reflist, output: seq_info = "results/{sample}/rafah/{sample}_Seq_Info.tsv" @@ -140,8 +154,6 @@ rule filter_rafah: # VirHostMatcher-Net rule run_vhmnet: input: - # Run vhulk first since it writing in the input - rules.run_vhulk.output.done_txt, reflist = rules.split_multifasta.output.reflist, output: done = touch("results/{sample}/vhmnet/.done.txt") @@ -149,11 +161,10 @@ rule run_vhmnet: # use it with # snakemake --singularity-args "-B /path/to/data/:/data" ... data_dir = "/data", - #data_dir = DATA_DIR, tmp_dir = "results/{sample}/vhmnet/tmp", output_dir = "results/{sample}/vhmnet", fasta_dir = "results/{sample}/tmp/genomes" - threads: 8 + threads: 12 container: "library://papanikos_182/default/vhmnet:0.1" log: @@ -204,7 +215,6 @@ rule run_wish: # snakemake --singularity-args binds the whole data dir to /data # see run_vhmnet rule models_dir = "/data/host_wish_model", - #models_dir = DATA_DIR.joinpath("host_wish_model"), output_dir = "results/{sample}/wish", fasta_dir = "results/{sample}/tmp/genomes" shell: @@ -247,7 +257,7 @@ rule run_htp: pred=$(viruses_classifier \ --classifier svc \ --nucleic_acid dna \ - -p ${{f}}) 2>{log.stderr} + -p ${{f}} 2>{log.stderr}) echo -e $contig_id\t$pred >> {output.htp_raw} done """ @@ -261,7 +271,7 @@ rule process_htp: ob = "{" shell: """ - tail -n +2 results/A/htp/raw.txt | cut -f1 -d','| \ + tail -n +2 results/{wildcards.sample}/htp/raw.txt | cut -f1 -d','| \ sed -r "s/ \{params.ob}'phage'\: /\t/" | sort -k1 \ >{output.predictions_tsv} """ -- GitLab