From 776827fba3f80a3127d5d10cd21786b4856b560e Mon Sep 17 00:00:00 2001
From: papanikos <n.pappas@uu.nl>
Date: Thu, 14 Jan 2021 21:05:52 +0100
Subject: [PATCH] add size filtering rule

---
 workflow/Snakefile | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 415e26c..ff19b64 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -51,6 +51,7 @@ def collect_prediction_tsvs(wc):
 rule all:
 	input:
 		expand([
+			"results/{sample}/tmp/filtered.fa.gz",
 			"results/{sample}/tmp/reflist.txt",
 			"results/{sample}/{tool}/predictions.tsv",
 			"results/{sample}/all_predictions.tsv",
@@ -61,9 +62,24 @@ rule all:
 ###########
 ## RULES ##
 ###########
-rule split_multifasta:
+
+rule size_filter:
 	input:
 		multifasta_fp = get_sample_fasta
+	output:
+		filtered_fasta = "results/{sample}/tmp/filtered.fa.gz"
+	threads: 4
+	log:
+		"logs/{sample}/size_filter.log"
+	params:
+		min_size = 5000
+	shell:
+		"seqkit seq -g -j {threads} -m {params.min_size} "
+		"{input.multifasta_fp} | gzip -c >{output.filtered_fasta} 2>{log}" 
+
+rule split_multifasta:
+	input:
+		multifasta_fp = rules.size_filter.output.filtered_fasta
 	output:
 		reflist = "results/{sample}/tmp/reflist.txt"
 	log: "logs/{sample}/split_multifasta.log"
@@ -109,8 +125,6 @@ rule filter_vhulk:
 # RAFAH
 rule run_rafah:
 	input:
-		# Run vhulk first since it writing in the input
-		rules.run_vhulk.output.done_txt,
 		reflist = rules.split_multifasta.output.reflist,
 	output:
 		seq_info = "results/{sample}/rafah/{sample}_Seq_Info.tsv"
@@ -140,8 +154,6 @@ rule filter_rafah:
 # VirHostMatcher-Net
 rule run_vhmnet:
 	input:
-		# Run vhulk first since it writing in the input
-		rules.run_vhulk.output.done_txt,
 		reflist = rules.split_multifasta.output.reflist,
 	output:
 		done = touch("results/{sample}/vhmnet/.done.txt")
@@ -149,11 +161,10 @@ rule run_vhmnet:
 		# use it with
 		# snakemake --singularity-args "-B /path/to/data/:/data" ...
 		data_dir = "/data",
-		#data_dir = DATA_DIR,
 		tmp_dir = "results/{sample}/vhmnet/tmp",
 		output_dir = "results/{sample}/vhmnet",
 		fasta_dir = "results/{sample}/tmp/genomes"
-	threads: 8
+	threads: 12
 	container:
 		"library://papanikos_182/default/vhmnet:0.1"
 	log:
@@ -204,7 +215,6 @@ rule run_wish:
 		# snakemake --singularity-args binds the whole data dir to /data
 		# see run_vhmnet rule
 		models_dir = "/data/host_wish_model",
-		#models_dir = DATA_DIR.joinpath("host_wish_model"),
 		output_dir = "results/{sample}/wish",
 		fasta_dir = "results/{sample}/tmp/genomes"
 	shell:
@@ -247,7 +257,7 @@ rule run_htp:
 			pred=$(viruses_classifier \
 					--classifier svc \
 					--nucleic_acid dna \
-					-p ${{f}}) 2>{log.stderr}
+					-p ${{f}} 2>{log.stderr})
 			echo -e $contig_id\t$pred >> {output.htp_raw}
 		done
 		"""
@@ -261,7 +271,7 @@ rule process_htp:
 		ob = "{"
 	shell:
 		"""
-		tail -n +2 results/A/htp/raw.txt | cut -f1 -d','| \
+		tail -n +2 results/{wildcards.sample}/htp/raw.txt | cut -f1 -d','| \
 				sed -r "s/ \{params.ob}'phage'\: /\t/" | sort -k1 \
 				>{output.predictions_tsv}
 		"""
-- 
GitLab