BLAST pipeline

This example splits a FASTA file into chunks and executes a BLAST query for each chunk in parallel. Then, all the sequences for the top hits are collected and merged into a single result file.

#!/usr/bin/env nextflow

/*
 * Pipeline input parameters. Each value below is only a default and can be
 * overridden with a command line option of the same name.
 *
 *   query     - FASTA file(s) containing the query sequences
 *   db        - path of the BLAST database (directory plus database name)
 *   out       - name of the file that receives the merged sequences
 *   chunkSize - number of sequences per chunk when the query is split
 */
params.query     = "${baseDir}/data/sample.fa"
params.db        = "${baseDir}/blast-db/pdb/tiny"
params.out       = "result.txt"
params.chunkSize = 100

// Split the database path into its last path component (the database name
// used inside the task scripts) and its parent directory (passed to the
// processes as an input).
db_name = file(params.db).name
db_dir  = file(params.db).parent

workflow {
    /*
     * Read the query FASTA file(s) and split them into chunk files, each one
     * holding as many sequences as defined by the parameter 'chunkSize'.
     * The chunks are emitted on the channel 'ch_fasta'.
     */
    ch_fasta = Channel
        .fromPath(params.query)
        .splitFasta(by: params.chunkSize, file: true)

    /*
     * Run one BLAST job for each chunk emitted by 'ch_fasta'
     * and emit the resulting BLAST matches.
     */
    ch_hits = blast(ch_fasta, db_dir)

    /*
     * Each time the 'blast' process emits a file, run an extract job
     * that produces a file containing the matching sequences.
     */
    ch_sequences = extract(ch_hits, db_dir)

    /*
     * Merge all the sequence files into a single file named by 'params.out'
     * and print the contents of that file once it is complete.
     */
    ch_sequences
        .collectFile(name: params.out)
        .view { merged -> "matching sequences:\n ${merged.text}" }
}

/*
 * Run blastp for one chunk of query sequences.
 *
 * Inputs:
 *   'query.fa' - the chunk of query sequences, staged under a fixed name
 *   db         - directory holding the BLAST database; the database name is
 *                taken from the script-level variable 'db_name'
 *
 * Output:
 *   'top_hits' - the second column of the first 10 lines of the tabular
 *                (-outfmt 6) BLAST result
 */
process blast {
    input:
    path 'query.fa'
    path db

    output:
    path 'top_hits'

    script:
    """
    blastp -db ${db}/${db_name} -query query.fa -outfmt 6 > blast_result
    head -n 10 blast_result | cut -f 2 > top_hits
    """
}

/*
 * Retrieve from the BLAST database the entries listed in a 'top_hits' file.
 *
 * Inputs:
 *   'top_hits' - batch file of entries to look up, staged under a fixed name
 *   db         - directory holding the BLAST database; the database name is
 *                taken from the script-level variable 'db_name'
 *
 * Output:
 *   'sequences' - the first 10 lines of the blastdbcmd output
 */
process extract {
    input:
    path 'top_hits'
    path db

    output:
    path 'sequences'

    script:
    """
    blastdbcmd -db ${db}/${db_name} -entry_batch top_hits | head -n 10 > sequences
    """
}

Try it on your computer

To run this pipeline on your computer, you will need:

Unix-like operating system
Java 17 (or higher)
Docker

Install Nextflow by entering the following command in the terminal:

$ curl -fsSL https://get.nextflow.io | bash

Then launch the pipeline with this command:

$ ./nextflow run blast-example -with-docker

It will automatically download the pipeline GitHub repository and the associated Docker images, so the first execution may take a few minutes to complete, depending on your network connection.

NOTE: To run this example with versions of Nextflow older than 22.04.0, you must include the -dsl2 flag with nextflow run.