
SCRIPT_DIR = '/data/projects/kimona/scripts/bachelor_thesis/scripts'
# DATA_DIR = '/data/projects/kimona/data_18-03-08'
DATA_DIR = '/data/projects/kimona/data'
NEEDLE_DIR = '{}/009_needle'.format(DATA_DIR)

DATABASES = ['ncbi', 'viralzone', 'phagesdb']

# if HOSTS = '' , TRAIN_NUM should be bigger than 1
# HOSTS = 'mycobac,strepto,escheri,gordoni,pseudom,arthrob,lactoco,staphyl'
HOSTS = 'mycobac'
TRAIN_NUM = 8
TRAIN_PERCENTAGE = 0.8

THREADS = 12
SUFFIXES = ['genomes.fasta', 'genomes.conversion', 'genes.fasta', 'genes.conversion']

# CLUSTERS = [1, 2, 3]
CLUSTERS = [1]

VARIANCE_THRESHOLD = 0.01

# less 006_test.genomes.fasta | grep "^>" | tr -d '>' | tr '\n' ',' | less
# PHAGES = 'phage0005309,phage0005377,phage0005898,phage0001424,phage0005329,phage0002284,phage0003202,phage0003198,phage0001297,phage0004464,phage0004280,phage0002782,phage0003704,phage0004281,phage0001233,phage0005661,phage0005178,phage2001241,phage0000094,phage2001535,phage2000563,phage2001740,phage0002038,phage0002037,phage2001556,phage0006181,phage2001790,phage2001137,phage0000851,phage0000847,phage2000785,phage2001273,phage2000758,phage2000200,phage0000195,phage0003144,phage2000747,phage2001841,phage0000869,phage2002033,phage0004890,phage0004381,phage0006301,phage0004091,phage0003685,phage0001351,phage0002676,phage0002357,phage2002292,phage2001302,phage2000831,phage0000077,phage0001393,phage0001394,phage0001389,phage2000851,phage0005707,phage2001091,phage0001944,phage2000658,phage0005756,phage0001209,phage0000086,phage0001286,phage0003369,phage0001507,phage0001530,phage0004820,phage0000718,phage0004361,phage0002323,phage0002908,phage0003942,phage0002354,phage0000829,phage0004968,phage0003932,phage2002409,phage0002836,phage0001685,phage0002356,phage0004190,phage0001411,phage0000317,phage0004024,phage0001735,phage2001505,phage0002763,phage2000286,phage0005832,phage0002239,phage2002402,phage0001211,phage0005274,phage0005259,phage0001705,phage2001543,phage0005117,phage0000111,phage0005978,phage0000070,phage2001527,phage0002169,phage0000243,phage0003253,phage2001461,phage2002285,phage2000676,phage2000868,phage2000860,phage0001369,phage0005883,phage2002374,phage2001006,phage2000904,phage0000821,phage0004348,phage0004029,phage0002264,phage2001964,phage0001382,phage0000671,phage0002903,phage0005662,phage0000689,phage0001324,phage2000797,phage2000776,phage2001882,phage0000127,phage0002152,phage2000966,phage0001482,phage2000317,phage0004177,phage0004293,phage0001734,phage0005064,phage2001445,phage0004780,phage0006211,phage0002182,phage2000779,phage2001351,phage0000091,phage2001792,phage0001531,phage2001100,phage2001888,phage2000492,phage0005744,phage2000454,phage2001758,phage2002378,phage0006245,phage0003982,phage0003973,phage2002369,phage0000830,phage2001752,phage0000071,phage0000173,phage2002160,phage0004806,phage2002314,phage0002331,phage0004796,phage0002330,phage0003976,phage0005187,phage0002167,phage0003699,phage2000911,phage2002114,phage2002362,phage2000203,phage0001502,phage2001481,phage2002436,phage0004669,phage2002250,phage0002032,phage2000186,phage0006242,phage2000672,phage0000841,phage0005787,phage2001833,phage2001951,phage0000272,phage0003712,phage0000820,phage2001139,phage2000769,phage0000176,phage2001909,phage0002456,phage0003050,phage0003449,phage0005543,phage0002905,phage0003356,phage0000294,phage0005653,phage0003454,phage0003730,phage0002635,phage0006204,phage2002345,phage0001671,phage2001155,phage2000260,phage0000063,phage2001110,phage2000426,phage0003000,phage0005785,phage0003729,phage0003732,phage0003042,phage0001396,phage0002656,phage0000882,phage0003289,phage0001794,phage0001030,phage2000119,phage2000460,phage0003726,phage2000532,phage0004185,phage0001112,phage2002101,phage0001105,phage2002431,phage2001538,phage0002589,phage0005100,phage0002998,phage0003442,phage0005567,phage0005542,phage0002153,phage0005775,phage2000032,phage2002467,phage0000980,phage2001876,phage0001547,phage2002199,phage2001879,phage0005173,phage2001873,phage2002087,phage0000088,phage0001955,phage0002202,phage2001416,phage0002237,phage2000036,phage0001480,phage0001214,phage2001935,phage2001147,phage0001136,phage2001803,phage1000707,phage2002120,phage0000693,phage2001269,phage0000439,phage0003076,phage0002383,phage0005156,phage2001181,phage0000938,phage0004186,phage2002223,phage2000740,phage0000271,phage0001168,phage0001092,phage2001547,phage2000310,phage0002272,phage0004878,phage0002631,phage2000131,phage2000932,phage0000186,phage0000244,phage0001942,phage0004187,phage0003351,phage2001285,phage0003245,phage2002237,phage0001111,phage0001954,phage0005973,phage2001500,phage2001323,phage2002245,phage0001117,phage0002352,phage2000486,phage2000939,phage2000566,phage0002351,phage0006632,phage0000686,phage2002125,phage2002331,phage0004851,phage0002233,phage0005637,phage2000771,phage2000657,phage2002385,phage2000244,phage2002480,phage2001891,phage0006073,phage2002252,phage2000709,phage0000507,phage0000620,phage0000491,phage0005182,phage0005181,phage2002007,phage0000499,phage2000489,phage2000993,phage0000556,phage0000213,phage0001466,phage2002025,phage2001023,phage2000943,phage2001516,phage2002086,phage0001263,phage0000544,phage0000513,phage2000122,phage2000141,phage0000559,phage2001837,phage0000569,phage0000474,phage2000551,phage2000764,phage0000478,phage2001244,phage2000721,phage0005189,phage2001960,phage2001929,phage0002309,phage0004852,phage2001280,phage0000489,phage0004847,phage0000532,phage2001692,phage0004995,phage2000106,phage0000567,phage2001052,phage0004175,phage0004798,phage0005878,phage2002371,phage2000798,phage2001548,phage2000156,phage2001034,phage0000183,phage0000067,phage0001681,phage2000393,phage2001444,phage0001682,phage2001186,phage0000688,phage2001558,phage0000534,phage0001680,phage2001701,phage2000281,phage0001746,phage0001747,phage2001365,phage0000576,phage0000588,phage0001679,phage0000662,phage0000660,phage0001290,phage0000516,phage2000173,phage0005823,phage0000557,phage2001963,phage2001885,phage0005184,phage2002205,phage2000720,phage0005003,phage2000268,phage2001457,phage0000931,phage0005977,phage2000685,phage0001957,phage0000599,phage2000220,phage0000084,phage0004841,phage0000607,phage2000439,phage0000241,phage2000505,phage2000570,phage2001595,phage0002341,phage0000522,phage2000906,phage0002157,phage0002199,phage2001043,phage2000936,phage2000879,phage0000932,phage2001518,phage0003341,phage2001233,phage0000497,phage2001066,phage0000657,phage2001195,phage0005576,phage2000717,phage2001650,phage0000933,phage0000175,phage2000832,phage0000928,phage0000930,phage0000523,phage0000486,phage0004194,phage2000292,phage0003832,phage2001007,phage2001524,phage2000152,phage2000350,phage0000615,phage0005937,phage0005639,phage0000756,phage0005727,phage2001463,phage2001379,phage2000837,phage2000557,phage0000666,phage0004447,phage0000305,phage0005166,phage2000017,phage0005435,phage0005424,phage0001772,phage2000977,phage0000506,phage0000324,phage0002353,phage2002344,phage0000423,phage0001344,phage0005174,phage0000233,phage0000339,phage0000717,phage0005429,phage2000586,phage0002902,phage0005370,phage0005183,phage0002286,phage0005897,phage0002251,phage0003184,phage0000155,phage0001514,phage0000344,phage0000498,phage0002050,phage0005393,phage0004510,phage0004697,phage2002201,phage0005427,phage1002035,phage0005860,phage0000964,phage2000218,phage2002382,phage0005803,phage0000685,phage2002295,phage0001474,phage2002058,phage2000967,phage0002826,phage0002824,phage2000187,phage2000430,phage0003920,phage2000420,phage0001446,phage0002213,phage2000421,phage2001926,phage2000506,phage2000241,phage0001241,phage0002220,phage0002423,phage0003188,phage0003997,phage2001428,phage0005912,phage0005769,phage2000537,phage0005997,phage0005987,phage0000859,phage0005109,phage0006651,phage0002007,phage0000426,phage0004212,phage0005235,phage2001844,phage0005959,phage2000788,phage2001497,phage2001135,phage0002462,phage2000502,phage0003765,phage0006340,phage0006541,phage0001998,phage2001897,phage0002375,phage1001005,phage2002023,phage0003097,phage2001727,phage2001747,phage0001592,phage0001600,phage0005409,phage0004918,phage0004251,phage0005436,phage0003231,phage0005831,phage0001973,phage0004181,phage0000464,phage2002062,phage0000698,phage0005349,phage0001784,phage0001326,phage1000447,phage0005432,phage0005280,phage0000229,phage0001546,phage0000942,phage0005711,phage0000447,phage0001700,phage0001543,phage0005433,phage0005237,phage0001569,phage0005347,phage0000156,phage0000343,phage0005234,phage0001790,phage2001130,phage0001630,phage2000481,phage0001016,phage0001386,phage0001793,phage0006553,phage0002339,phage0004467,phage0005565,phage0000996,phage0004679,phage0005337,phage1001004,phage0004074,phage0005731,phage0001641,phage0001580,phage0002263,phage0001000,phage0005233,phage0006451,phage0001019,phage0001631,phage0001008,phage0001009,phage0005925,phage0001635,phage0005984,phage0001014,phage0001644,phage0001010,phage0001577,phage0003985,phage0002001,phage0006157,phage0001575,phage0002290,phage0006136,phage0006141,phage0002959,phage0006140,phage0006143,phage0006147,phage0006151,phage0004476,phage0006163,phage0005116,phage0005774,phage0002968,phage0000044,phage0004475,phage0002973,phage0000043,phage0000027,phage0000026,phage0000034,phage0002977,phage0000048,phage0000038,phage0002954,phage0000411,phage0000049,phage0003091,phage0000682,phage0000046,phage0006527,phage0006009,phage0002986,phage0004665,phage0002258,phage0002607,phage0002277,phage0005807,phage0001521,phage0001534,phage0005980,phage0005920,phage0006270,phage0001941,phage0002298,phage0002355,phage0000300,phage0002544,phage0005310,phage0001448,phage2001978,phage2000024,phage2001010,phage2000980,phage0000883,phage2000898,phage0001212,phage0000884,phage2001202,phage2001990,phage0003056,phage1001002,phage1001006,phage0001441'
PHAGES = 'phage0000001'
CLUSTER_METHODS = ['needle_mcl']
# CLUSTER_METHODS = ['mcl']
# CLUSTER_METHODS = ['spectral']

###################################################################################################
# rules ###########################################################################################
###################################################################################################

rule run:
  input:
    expand('{data_dir}/011_{cluster_method}_cluster_annotations/Cluster_{i}.result',
            data_dir=DATA_DIR,
            cluster_method=CLUSTER_METHODS,
            i=CLUSTERS),

    expand('{data_dir}/015_{cluster_method}.{type}.{spec}.stats',
            data_dir=DATA_DIR,
            cluster_method=CLUSTER_METHODS,
            type='fs',
            spec=HOSTS.split(',')),

rule download:
  output:
    expand('{{data_dir}}/001_{{database}}.{suffix}', suffix=SUFFIXES)
  params:
    script_dir=SCRIPT_DIR
  log:
    '{data_dir}/001_{database}.stdout',
    '{data_dir}/001_{database}.stderr'
  shell:
    '''
    mkdir -p {wildcards.data_dir}
    {params.script_dir}/001_download_from_{wildcards.database}.py {wildcards.data_dir} > {log[0]} 2> {log[1]}
    '''

rule merge:
  input:
    expand('{{data_dir}}/001_{database}.{suffix}', database=DATABASES, suffix=SUFFIXES)
  output:
    '{data_dir}/002_merged.genes.conversion',
    '{data_dir}/002_merged.genes.fasta',
    '{data_dir}/002_merged.genomes.conversion',
    '{data_dir}/002_merged.genomes.fasta'
  shell:
    '''
    cat {wildcards.data_dir}/001_*.genes.conversion   > {output[0]}
    cat {wildcards.data_dir}/001_*.genes.fasta        > {output[1]}
    cat {wildcards.data_dir}/001_*.genomes.conversion > {output[2]}
    cat {wildcards.data_dir}/001_*.genomes.fasta      > {output[3]}
    '''

rule eliminate_duplicates:
  input:
    '{data_dir}/002_merged.genes.conversion',
    '{data_dir}/002_merged.genomes.conversion',
    '{data_dir}/002_merged.genomes.fasta'
  output:
    '{data_dir}/003_deduplicated.genes.conversion',
    '{data_dir}/003_deduplicated.genomes.conversion',
    '{data_dir}/003_deduplicated.genomes.fasta'
  params:
    script_dir=SCRIPT_DIR
  shell:
    '''
    {params.script_dir}/003_deduplicate_genomes.py {input[2]} {input[1]} {input[2]} {wildcards.data_dir}
    '''

rule annotate:
  input:
    '{data_dir}/003_deduplicated.genomes.fasta'
  output:
    '{data_dir}/004_PROKKA/genomes.gbk'
  threads:
    THREADS
  shell:
    '''
    prokka  --force                                                           \
            --cpus {threads}                                                  \
            --kingdom Viruses                                                 \
            --outdir {wildcards.data_dir}/004_PROKKA                          \
            --prefix genomes                                                  \
            {input[0]}
    '''

rule extract_genes:
  input:
    '{data_dir}/004_PROKKA/genomes.gbk'
  output:
    '{data_dir}/005_annotated.genes.conversion',
    '{data_dir}/005_annotated.genes.fasta'
  params:
    script_dir=SCRIPT_DIR,
    prefix='{data_dir}/005_annotated'
  shell:
    '''
    {params.script_dir}/005_extract_prokka_genes.py {input[0]}                \
                                                    {params.prefix}
    '''

rule count_hosts:
  input:
    '{data_dir}/003_deduplicated.genomes.conversion'
  output:
    '{data_dir}/004_hosts',
    '{data_dir}/004_hosts.counts'
  params:
    script_dir=SCRIPT_DIR
  shell:
    '''
    {params.script_dir}/101_create_host_string.py {input[0]} > {output[0]}
    sort -k 2 {output[0]} | cut -f2 | sed "s/\\(^.\\{{7\\}}\\).*/\\1/" | uniq -c | sort -nr > {output[1]}
    '''

rule split_dataset:
  input:
    '{data_dir}/003_deduplicated.genomes.conversion',
    '{data_dir}/003_deduplicated.genomes.fasta',
    '{data_dir}/005_annotated.genes.conversion',
    '{data_dir}/005_annotated.genes.fasta',
    '{data_dir}/004_hosts',
    '{data_dir}/004_hosts.counts'
  output:
    expand('{{data_dir}}/006_{set_name}.{suffix}', set_name=['train', 'test', 'other'], suffix=SUFFIXES)
  params:
    script_dir=SCRIPT_DIR,
    train_percentage=TRAIN_PERCENTAGE,
    train_num=TRAIN_NUM,
    hosts=HOSTS
  shell:
    '''
    if [ {params.hosts} = "" ]; then
        HOSTS=$(grep -v no_host {input[5]} | head -n {params.train_num} | tr -s " " | cut -f3 -d " " | tr "\n" ",")
        HOSTS=${{HOSTS::-1}}
    else
        HOSTS={params.hosts}
    fi

    {params.script_dir}/006_split_train_test_other.py {input[2]} {input[3]} {input[0]} {input[1]} {input[4]} \
                                                      ${{HOSTS}} {params.train_percentage}
    '''

rule cd_hit:
  input:
    '{data_dir}/006_train.genes.fasta'
  output:
    '{data_dir}/007_train.cd-hit.genes.fasta',
    '{data_dir}/007_train.cd-hit.genes.fasta.clstr'
  threads:
    THREADS
  shell:
    '''
    cd-hit -c 1 -s 1 -g 1 -d 0 -T {threads} -i {input[0]} -o {output[0]}
    '''

rule crocoblast:
  input:
    '{data_dir}/007_train.cd-hit.genes.fasta'
  output:
    '{data_dir}/008_crocoblast.abc'
  params:
    script_dir=SCRIPT_DIR
  threads:
    THREADS
  shell:
    '''
    rm -rf {wildcards.data_dir}/008_crocoblast/ {wildcards.data_dir}/008_crocoblast_database/
    cp -r  {params.script_dir}/crocoblast/ {wildcards.data_dir}/008_crocoblast/
    mkdir  {wildcards.data_dir}/008_crocoblast_database

    {wildcards.data_dir}/008_crocoblast/crocoblast  -add_database                       \
                                                      --sequence_file                   \
                                                        protein                         \
                                                        {input[0]}                      \
                                                        007_train.cd-hit.genes.fasta    \
                                                        {wildcards.data_dir}/008_crocoblast_database

    {wildcards.data_dir}/008_crocoblast/crocoblast  -add_to_queue                       \
                                                      blastp                            \
                                                      007_train.cd-hit.genes.fasta      \
                                                      {input[0]}                        \
                                                      {wildcards.data_dir}              \
                                                      --blast_options                   \
                                                        -outfmt 6                       \
                                                        -max_target_seqs 1000000        \
                                                        -max_hsps 1

    echo "{wildcards.data_dir}/008_crocoblast/crocoblast -run > /dev/null; touch {wildcards.data_dir}/008_finished"   \
    | qsub -l thr={threads} -cwd -N crocoblast

    while [ ! -f {wildcards.data_dir}/008_finished ]; do
        sleep 20m
    done

    mv {wildcards.data_dir}/CrocoBLAST_1 {wildcards.data_dir}/008_crocoblast_output
    cut -f 1,2,11 {wildcards.data_dir}/008_crocoblast_output/complete_assembled_output > {output[0]}
    '''

rule blast_to_needle:
  input:
    '{data_dir}/008_crocoblast.abc',
    '{data_dir}/006_train.genes.fasta'
  output:
    '{data_dir}/008_crocoblast.needle.abc'
  params:
    script_dir=SCRIPT_DIR,
    needle_dir=NEEDLE_DIR
  shell:
    '''
    rm -rf {params.needle_dir}
    mkdir  {params.needle_dir}

    {params.script_dir}/007_parallelize_global_alignment_from_blast.py  {input[0]}                \
                                                                        {input[1]}                \
                                                                        {params.needle_dir}

    while [ ! -f {params.needle_dir}/qsub_completed ]; do
        sleep 20m
    done;

    for i in $(seq 0 1 9); do
        cat {params.needle_dir}/tmp/${{i}}*.abc > {params.needle_dir}/tmp/${{i}}.final.abc        \
            && echo "{params.needle_dir}/tmp/${{i}}.final.abc created."                           \
            || echo "There are no files to merge into ${{i}}.final.abc."
    done;

    cat {params.needle_dir}/tmp/*.final.abc > {output[0]}
    '''

rule cluster_spectral:
  input:
    '{data_dir}/008_crocoblast.abc'
  output:
    '{data_dir}/009_spectral.clusters'
  shell:
    '''
    clusterx -t blast {input[0]} -o {output[0]}
    '''

rule cluster_mcl:
  input:
    '{data_dir}/008_crocoblast.abc'
  output:
    '{data_dir}/009_mcl.clusters'
  shell:
    '''
    clusterx -m mcl -p inflation=1.2 {input[0]} -o {output[0]}
    '''

rule cluster_needle_mcl:
  input:
    '{data_dir}/008_crocoblast.needle.abc'
  output:
    '{data_dir}/009_needle_mcl.clusters'
  shell:
    '''
    clusterx -m mcl -p inflation=1.2,dont_transform=True {input[0]} -o {output[0]}
    '''

rule create_cluster_file:
  input:
    '{data_dir}/009_{cluster_method}.clusters',
    '{data_dir}/007_train.cd-hit.genes.fasta.clstr'
  output:
    '{data_dir}/010_{cluster_method}.tsv'
  params:
    script_dir=SCRIPT_DIR
  shell:
    '''
    {params.script_dir}/010_create_cluster_file.py {input[0]} {input[1]} {output[0]}
    '''

rule create_matrix:
  input:
    '{data_dir}/010_{cluster_method}.tsv',
    '{data_dir}/006_train.genes.conversion',
    '{data_dir}/006_train.genomes.fasta'
  output:
    '{data_dir}/011_matrix.{cluster_method}.raw.tsv'
  params:
    script_dir=SCRIPT_DIR,
    genomes_list='{data_dir}/011_train.genomes.list'
  shadow:
    'shallow'
  shell:
    '''
    grep "^>" {input[2]} | tr -d ">" | sort > {params.genomes_list}

    {params.script_dir}/009_parallelize_matrix_creation_from_mcl.py {input[1]}                    \
                                                                    {input[0]}                    \
                                                                    {params.genomes_list}

    while [ $(ls {wildcards.data_dir}/matrix.part* | wc -l) -ne 11 ]; do
        sleep 1m
    done

    sleep 1m
    cat {wildcards.data_dir}/matrix.part* > {output[0]}
    rm {wildcards.data_dir}/matrix.part*
    '''

rule annotate_cluster:
  input:
    '{data_dir}/005_annotated.genes.fasta',
    '{data_dir}/010_{cluster_method}.tsv',
  output:
    '{data_dir}/011_{cluster_method}_cluster_annotations/Cluster_{i}.result'
  params:
    script_dir=SCRIPT_DIR,
    cluster_annotation_dir='{data_dir}/011_{cluster_method}_cluster_annotations'
  shell:
    '''
    mkdir -p {params.cluster_annotation_dir}

    {params.script_dir}/101_prepare_cluster_for_interpro.py {input[0]}                            \
                                                            {input[1]}                            \
                                                            {wildcards.i}                         \
                                                            > {params.cluster_annotation_dir}/Cluster_{wildcards.i}.genes.fasta

    cd-hit  -i {params.cluster_annotation_dir}/Cluster_{wildcards.i}.genes.fasta                  \
            -o {params.cluster_annotation_dir}/Cluster_{wildcards.i}.genes.fasta.cd-hit           \
            -c 1 -d 0
            # -T 16

    interproscan  -i {params.cluster_annotation_dir}/Cluster_{wildcards.i}.genes.fasta.cd-hit     \
                  -o {params.cluster_annotation_dir}/Cluster_{wildcards.i}.genes.fasta.cd-hit.tsv \
                  -f tsv -goterms
                  # -cpu 16

    cut -f 12,13 {params.cluster_annotation_dir}/Cluster_{wildcards.i}.genes.fasta.cd-hit.tsv     \
        | sort | uniq -c | sort -nr                                                               \
        > {params.cluster_annotation_dir}/Cluster_{wildcards.i}.result
    '''

rule select_features:
  input:
    '{data_dir}/011_matrix.{cluster_method}.raw.tsv'
  output:
    '{data_dir}/011_matrix.{cluster_method}.fs.tsv'
  params:
    script_dir=SCRIPT_DIR,
    variance_threshold=VARIANCE_THRESHOLD
  shell:
    '''
    {params.script_dir}/011_feature_selection.py {input[0]} {output[0]} {params.variance_threshold}
    '''

rule split_matrix:
  input:
    '{data_dir}/011_matrix.{cluster_method}.{type}.tsv',
    '{data_dir}/004_hosts'
  output:
    expand([
            '{{data_dir}}/012_matrix.{{cluster_method}}.{{type}}.{host}.tsv',
            '{{data_dir}}/012_hosts.{{cluster_method}}.{{type}}.{host}'
            ], host=HOSTS.split(',')),
  params:
    script_dir=SCRIPT_DIR,
    hosts=HOSTS
  shell:
    '''
    {params.script_dir}/012_split_matrix.py {input[0]} {input[1]} {params.hosts}
    '''

rule create_model:
  input:
    expand('{{data_dir}}/012_matrix.{{cluster_method}}.{{type}}.{host}.tsv', host=HOSTS.split(',')),
  output:
    '{data_dir}/013_model.{cluster_method}.{type}.{spec}.pkl',
    '{data_dir}/013_tree.{cluster_method}.{type}.{spec}.pdf'
  params:
    script_dir=SCRIPT_DIR
  shell:
    '''
    {params.script_dir}/013_decision_tree.py {wildcards.spec} {input}
    '''

rule split_test_fasta:
  input:
    '{data_dir}/006_test.genomes.fasta'
  output:
    '{data_dir}/014_classification/{phage}.genomes.fasta'
  params:
    classify_dir='{data_dir}/014_classification',
  shell:
    '''
    mkdir -p {params.classify_dir}

    grep -A 1 {wildcards.phage} {input[0]} > {output[0]}
    '''

rule get_genes:
  input:
    '{data_dir}/014_classification/{phage}.genomes.fasta'
  output:
    '{data_dir}/014_classification/{phage}.genes.fasta'
  params:
    script_dir=SCRIPT_DIR,
    classify_dir='{data_dir}/014_classification',
    prefix='{data_dir}/014_classification/{phage}'
  shell:
    '''
    prokka  --force                                                                               \
            --kingdom Viruses                                                                     \
            --outdir {params.classify_dir}                                                        \
            --prefix {wildcards.phage}                                                            \
            {input[0]}

    {params.script_dir}/005_extract_prokka_genes.py {params.classify_dir}/{wildcards.phage}.gbk   \
                                                    {params.prefix}
    '''

rule blast_genes:
  input:
    '{data_dir}/014_classification/{phage}.genes.fasta',
  output:
    '{data_dir}/014_classification/{phage}.genes.blast',
  params:
    database='{data_dir}/008_crocoblast_database/007_train.cd-hit.genes.fasta',
  shell:
    '''
    blastp  -query {input[0]}                                                                     \
            -db {params.database}                                                                 \
            -out {output[0]}                                                                      \
            -outfmt "6 qseqid sseqid score pident evalue"                                         \
            -max_target_seqs 1
    '''

rule classify:
  input:
    '{data_dir}/014_classification/{phage}.genes.blast',
    '{data_dir}/010_{cluster_method}.tsv',
    '{data_dir}/011_matrix.{cluster_method}.{type}.tsv',
    '{data_dir}/013_model.{cluster_method}.{type}.{spec}.pkl',
  output:
    '{data_dir}/014_classification/{phage}.{cluster_method}.{type}.{spec}.vector',
    '{data_dir}/014_classification/{phage}.{cluster_method}.{type}.{spec}.result',
  params:
    script_dir=SCRIPT_DIR,
  shell:
    '''
    {params.script_dir}/014_create_vector.py  {input[0]} {input[1]} {input[2]} {input[3]}         \
                                              {output[0]} {output[1]}
    '''

rule stats:
  input:
    results=expand('{{data_dir}}/014_classification/{phage}.{{cluster_method}}.{{type}}.{{spec}}.result',
                    phage=PHAGES.split(',')),
    hosts='{data_dir}/004_hosts'
  output:
    '{data_dir}/015_{cluster_method}.{type}.{spec}.results',
    '{data_dir}/015_{cluster_method}.{type}.{spec}.stats'
  params:
    script_dir=SCRIPT_DIR,
  shell:
    '''
    cat {input.results} > {output[0]}

    {params.script_dir}/015_calculate_stats.py {input.hosts} {output[0]} {output[1]}
    '''
