Useful Bioinformatics One-liners
Last updated on 2024-09-20
Some of these one-liners are from Stack Overflow, Stack Exchange, Biostars, etc. I can’t thank the people on these platforms enough.
Note that some of the commands require gawk to be installed. If you’re using Ubuntu, run sudo apt install gawk to install it.
All calculations, modifications, etc. require single-line fasta, so please convert your fasta into singleline format first.
Fastq
Use cat instead of zcat if files are not compressed.
- Get read length and the count number for each length.
zcat file.fq.gz | awk 'NR%4==2 {len[length($0)]++} END {for (i in len) {print i"\t"len[i]}}'
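A toy example to illustrate (output line order may vary, since awk array traversal is unordered):
Input:
@r1
ACGT
+
IIII
@r2
TTGA
+
IIII
@r3
ACGTAC
+
IIIIII
Output:
4	2
6	1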
- Filter based on read length.
awk -v n=100 'NR%4==1 {header=$0} NR%4==2 {seq=$0} NR%4==0 {if (length(seq) > n) {print header "\n" seq "\n+\n" $0}}'
- Get the total size (Gb) and number of reads.
zcat file.fq.gz | awk 'NR%4==2 {sum+=length($0); count++} END {printf "size: %.3f\nnumber of reads: %d\n", sum/1e9, count}'
- Interleave paired-end reads.
paste <(zcat forward.fq.gz) <(zcat reverse.fq.gz) | paste - - - - | awk 'BEGIN {FS="\t"; OFS="\n"} {print $1, $3, $5, $7, $2, $4, $6, $8}'
zcat reverse.fq.gz | sed 'R /dev/stdin' <(zcat forward.fq.gz) | sed 'N; N; N; N; N; N; N; s/\n/\t/g' | awk 'BEGIN {FS="\t"; OFS="\n"} {print $1, $3, $5, $7, $2, $4, $6, $8}'
awk '{print} NR%4==0 {i=4; while (i>0) {"zcat reverse.fq.gz" | getline; print; i--}}' <(zcat forward.fq.gz)
- Deinterleave reads.
paste - - - - - - - - < interleaved.fq | tee >(cut -f 1-4 | tr "\t" "\n" > forward.fq) | cut -f 5-8 | tr "\t" "\n" > reverse.fq
cat interleaved.fq | awk '/^@(.+)* 1:/ {j=$0; i=4; while (i>1) {i--; getline l; j=j ORS l}; print j > "forward.fq"} /^@(.+)* 2:/ {j=$0; i=4; while (i>1) {i--; getline l; j=j ORS l}; print j > "reverse.fq"}'
- Deduplicate single-end reads.
zcat file.fq.gz | awk '/^@/ {getline seq; getline plus; getline qual; f=!a[seq]++} f {print $0 "\n" seq "\n" plus "\n" qual}'
- Deduplicate paired-end reads.
paste <(zcat forward.fq.gz) <(zcat reverse.fq.gz) | paste - - - - | awk 'BEGIN {FS=OFS="\t"} {print $1, $2"\n"$3, $4"\n"$5, $6"\n"$7, $8}' | awk '/^@/ {getline l1; f=!a[l1]++; getline l2; getline l3} f {print $0"\n"l1"\n"l2"\n"l3}' | paste - - - - | awk 'BEGIN {FS="\t"; OFS="\n"} {print $1, $3, $5, $7, $2, $4, $6, $8}' | awk '/^@(.+)* 1:/ {j=$0; i=4; while (i>1) {i--; getline l; j=j ORS l}; print j > "dedup_forward.fq"} /^@(.+)* 2:/ {j=$0; i=4; while (i>1) {i--; getline l; j=j ORS l}; print j > "dedup_reverse.fq"}'
- Deduplicate interleaved reads.
awk 'BEGIN {OFS="\n"} /^@(.+) 1(.+)/ {f1=$0; getline f2; getline f3; getline f4; next} /^@(.+) 2(.+)/ {r1=$0; getline r2; getline r3; getline r4} {f=!a[f2, r2]++} f {print f1, f2, f3, f4, r1, r2, r3, r4}' interleaved.fq
- Remove singletons in single-end reads.
zcat file.fq.gz | awk '/^@/ {getline l1; getline l2; getline l3; a[l1][$0"\n"l1"\n"l2"\n"l3]=b[l1]++} END {for (i in b) if (b[i]>1) c[i]; for (i in c) for (j in a[i]) print j}'
- Remove singletons in paired-end reads.
paste <(zcat forward.fq.gz) <(zcat reverse.fq.gz) | paste - - - - | awk 'BEGIN {FS=OFS="\t"} {print $1, $2"\n"$3, $4"\n"$5, $6"\n"$7, $8}' | awk '/^@/ {getline l1; getline l2; getline l3; a[l1][$0"\n"l1"\n"l2"\n"l3]=b[l1]++} END {for (i in b) if (b[i]>1) c[i]; for (i in c) for (j in a[i]) print j}' | paste - - - - | awk 'BEGIN {FS="\t"; OFS="\n"} {print $1, $3, $5, $7, $2, $4, $6, $8}' | awk '/^@(.+)* 1:/ {j=$0; i=4; while (i>1) {i--; getline l; j=j ORS l}; print j > "multiton_forward.fq"} /^@(.+)* 2:/ {j=$0; i=4; while (i>1) {i--; getline l; j=j ORS l}; print j > "multiton_reverse.fq"}'
- Remove singletons in interleaved reads.
awk 'BEGIN {OFS="\n"} /^@(.+) 1(.+)/ {f1=$0; getline f2; getline f3; getline f4; next} /^@(.+) 2(.+)/ {r1=$0; getline r2; getline r3; getline r4} {a[f2, r2][f1"\n"f2"\n"f3"\n"f4"\n"r1"\n"r2"\n"r3"\n"r4]=b[f2, r2]++} END {for (i in b) if (b[i]>1) c[i]; for (i in c) for (j in a[i]) print j}' interleaved.fq
- Trim single-end reads and print out summary of reads. Here I set the score threshold at 20 and minimum length at 30.
zcat file.fq.gz | awk -v score=20 -v len=30 'BEGIN {OFS="\n"; for (i=33; i<=73; i++) {if (score==j++) {score=sprintf("%c", i); break}}} /^@/ {getline seq; getline extra; getline qual; total_bp+=length(seq); total_reads++; if (match(qual, score)) {seq=substr(seq, 1, RSTART-1); qual=substr(qual, 1, RSTART-1)}; if (length(seq)>=len) {print $0, seq, extra, qual > "trimmed.fq"; trimmed_bp+=length(seq); trimmed_reads++}} END {printf "total reads:\t%s\ntotal basepairs:\t%s\ntrimmed reads:\t%s\ntrimmed basepairs:\t%s\n", total_reads, total_bp, trimmed_reads, trimmed_bp}'
- Trim paired-end reads.
paste <(zcat forward.fq.gz) <(zcat reverse.fq.gz) | paste - - - - | awk 'BEGIN {FS=OFS="\t"} {print $1, $2"\n"$3, $4"\n"$5, $6"\n"$7, $8}' | awk -v score=20 -v len=30 'BEGIN {OFS="\n"; for (i=33; i<=73; i++) {if (score==j++) {score=sprintf("%c", i); break}}} /^@/ {getline seq; getline extra; getline qual; split($0, a, "\t"); split(seq, b, "\t"); split(extra, c, "\t"); split(qual, d, "\t"); total_reads++; total_bp+=(length(b[1])+length(b[2])); if (match(d[1], score)) {b[1]=substr(b[1], 1, RSTART-1); d[1]=substr(d[1], 1, RSTART-1)}; if (match(d[2], score)) {b[2]=substr(b[2], 1, RSTART-1); d[2]=substr(d[2], 1, RSTART-1)}; if (length(b[1])>=len && length(b[2])>=len) {print a[1], b[1], c[1], d[1] > "trimmed_forward.fq"; print a[2], b[2], c[2], d[2] > "trimmed_reverse.fq"; trimmed_reads++; trimmed_bp+=(length(b[1])+length(b[2]))}} END {printf "total reads:\t%s\ntotal basepairs:\t%s\ntrimmed reads:\t%s\ntrimmed basepairs:\t%s\n", total_reads*2, total_bp, trimmed_reads*2, trimmed_bp}'
- Trim interleaved reads.
awk -v score=20 -v len=30 'BEGIN {OFS="\n"; for (i=33; i<=73; i++) {if (score==j++) {score=sprintf("%c", i); break}}} /^@(.+) 1(.+)/ {f1=$0; getline f2; getline f3; getline f4; next} /^@(.+) 2(.+)/ {r1=$0; getline r2; getline r3; getline r4} {total_reads++; total_bp+=(length(f2)+length(r2)); if (match(f4, score)) {f2=substr(f2, 1, RSTART-1); f4=substr(f4, 1, RSTART-1)}; if (match(r4, score)) {r2=substr(r2, 1, RSTART-1); r4=substr(r4, 1, RSTART-1)}; if (length(f2)>=len && length(r2)>=len) {print f1, f2, f3, f4, r1, r2, r3, r4 > "trimmed_interleaved.fq"; trimmed_reads++; trimmed_bp+=(length(f2)+length(r2))}} END {printf "total reads:\t%s\ntotal basepairs:\t%s\ntrimmed reads:\t%s\ntrimmed basepairs:\t%s\n", total_reads*2, total_bp, trimmed_reads*2, trimmed_bp}' interleaved.fq
- Check the average, max, and min read quality at each base position. Set phred to the Phred offset of your data (typically 33).
awk -v phred=33 -l ordchr 'BEGIN {OFS="\t"} NR%4==0 {for (i=1; i<=length($0); i++) {score=ord(substr($0, i, 1)) - phred; sum[i]+=score; min[i]=(min[i]>score || !min[i] ? score : min[i]); max[i]=(max[i]<score ? score : max[i]); count[i]++}} END {for (i in count) {print i, sum[i]/count[i], min[i], max[i], count[i]}}' file.fq
- Convert fastq to fasta.
sed -n '1~4s/^@/>/p; 2~4p' input.fastq > output.fasta
sed -e '/^@/!d; s//>/; N' input.fastq > output.fasta
awk 'NR%4==1 {sub(/^@/, ">", $0); print; getline; print}' input.fastq > output.fasta
- Split a fastq file into multiple files with approximately equal number of sequences. Here, I split the file into 4 files.
awk -v n=4 '{split(FILENAME, file, "."); count++} (NR%4==1) {a[count]=$0; for (i=1; i<=3; i++) {getline; a[count]=a[count] "\n" $0}} END {for (i=1; i<=n; i++) {for (j=i; j<=count; j+=n) {print a[j] > file[1] i "." file[2]}}}' file.fq
- Convert fastq to unmapped SAM and BAM.
awk '{ORS=(NR%4==0 ? "\n" : "\t")} 1' <(zcat file.fq.gz) | awk 'BEGIN {FS=OFS="\t"} {match($1, /@(.*)[ |[:alpha:]]/, a); gsub(/^@/, "", $1); print $1, length($2), "*\t0\t0\t*\t*\t0\t0", $2, $4, "RG:Z:" a[1]}' | samtools view -Sb - > file.bam
Use xargs for parallel processing.
In this example, I use xargs to deduplicate multiple paired-end fastq files and convert them into interleaved-read fasta.
find . -maxdepth 1 -type f -name "*_1.fastq.gz" -print0 | sed 's/_1.fastq.gz//g' | xargs -0 -P 4 -I {} bash -c 'name=$(basename {}); paste <(zcat ${name}_1.fastq.gz) <(zcat ${name}_2.fastq.gz) | paste - - - - | awk '\''BEGIN {FS=OFS="\t"} {print $1, $2"\n"$3, $4}'\'' | awk '\''/^@/ {getline l; f=!a[l]++} f {print $0"\n"l}'\'' | paste - - | awk '\''BEGIN {FS="\t"; OFS="\n"} {print $1, $3, $2, $4}'\'' | sed '\''s/^@/>/g'\'' > interleaved_${name}.fasta'
Fasta
- Format multiline fasta to singleline fasta.
sed ':a; />/!N; s/\n//; ta; s/>/\n>/; s/^\s*//' file.fa
awk '/^>/ {printf "%s%s\n", (NR>1 ? "\n" : ""), $0; next} {printf "%s", toupper($0)} ENDFILE {printf "\n"}' file.fa
awk '/^>/ {if (seq) {print seq}; seq=""; print; next} {seq=seq $0} END {if (seq) {print seq}}' file.fa
awk 'BEGIN {ORS=""} {if ($0 ~ /^>/) {printf "%s%s%s", (NR==1 ? "" : "\n"), $0, "\n"} else {print $0}}' file.fa
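A toy example to illustrate (note that the second variant also uppercases the bases):
Input:
>seq1
ATG
CGT
>seq2
AAAA
Output:
>seq1
ATGCGT
>seq2
AAAA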
- If the fasta file was copied and pasted from a web browser, run this command first to strip carriage returns before any other processing.
sed 's/\r//g' file.fa
- Format singleline fasta to multiline fasta. Here I use a limit of 60 characters per line; set width to the desired number of characters per line.
awk -v width=60 'BEGIN {ORS=""} /^>/ {if (length(line)) {print line "\n"; line=""} print $0 "\n"; next} {line=line $0; while (length(line)>=width) {print substr(line, 1, width) "\n"; line=substr(line, width+1)}} END {if (length(line)) print line "\n"}' file.fa
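A toy example to illustrate, using width=4 for brevity:
Input:
>seq1
ATGCGTAAAT
Output:
>seq1
ATGC
GTAA
AT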
- Split a fasta file into multiple files with approximately equal number of sequences. Here, I split the file into 4 files.
awk -v n=4 '{split(FILENAME, file, "."); count++} /^>/ {getline seq; a[count]=$0"\n"seq} END {for (i=1; i<=n; i++) {for (j=i; j<=count; j+=n) {print a[j] > file[1] i "." file[2]}}}' file.fa
- Rename sequence header of interleaved fastq-converted-to-fasta with leading power of 10. This helps when one uses BLAST or any classifier to classify reads.
awk 'BEGIN {OFS="\n"; count=0} /^>/ {getline seq; match($0, />(.+)* /, name); if (b[name[1]]++) {a[count++][name[1]"_R"]=seq} else {a[count][name[1]"_F"]=seq}} END {label=10^length(count); for (i in a) {for (j in a[i]) {split(j, c, "_"); print ">"label+i"_"c[2], a[i][j]}}}' file.fa
- Rename sequence header of fasta with leading power of 10.
awk '/^>/ {getline seq; a[count++][$0]=seq} END {label=10^length(count); for (i in a) for (j in a[i]) print ">"label+i"\n"a[i][j]}' file.fa
- Rename sequence header of fasta with leading power of 10 and create a table of indices.
awk '/^>/ {getline seq; a[count++]=$0"\n"seq} END {match(FILENAME, /^(.+)\./, fname); match(FILENAME, /([^\.]*$)/, fext); label=10^length(count); for (i in a) {split(a[i], b, "\n"); print ">"label+i"\n"b[2] > "renamed_" fname[1] "." fext[1]; gsub(/^>/, "", b[1]); print label+i "\t" b[1] > "indices_" fname[1] ".tsv"}}' file.fa
- Get the length of each sequence.
awk '/^>/ {getline seq; sub(/^>/, "", $0); print $0"\t"length(seq)}' file.fa
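A toy example to illustrate:
Input:
>seq1
ATGC
>seq2
AAATTT
Output:
seq1	4
seq2	6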
- Get the total size (Mb) and number of sequences.
awk '/^>/ {getline seq; sum+=length(seq); count++} END {printf "%s\t%.3f\t%d\n", FILENAME, sum/1e6, count}' file.fa
- Filter sequence based on sequence length (set n to the desired length).
awk -v n=1000 '/^>/ {getline seq} length(seq)>n {print $0"\n"seq}' file.fa
- Get N50, L50, and auN (area under the Nx curve, a new metric to evaluate assembly quality). Here I set n=0.5 to find the N50.
awk -v n=0.5 '/^>/ {getline seq; a[$0]=length(seq)} END {asort(a); for (i in a) {sum+=a[i]; sum_sq+=(a[i]**2); len[i]=a[i]}; auN=sum_sq/sum; for (j in len) {csum+=len[j]; if (csum>sum*(1-n)) {printf "N%d: %d\tL%d: %d\tauN: %.2f\n", n*100, len[j], n*100, j, auN; break}}}' file.fa
- Get the GC content of all the sequences.
awk '/^>/ {getline seq; total_len+=length(seq); gsub(/[AaTt]/, "", seq); gc_len+=length(seq)} END {printf "%s\t%.3f\n", FILENAME, gc_len*100/total_len}' file.fa
- Get the GC content of each sequence.
awk '/^>/ {getline seq; sub(/^>/, "", $0); len=length(seq); at_len=gsub(/[AaTt]/, "", seq); printf "%s\t%.3f\n", $0, (len-at_len)*100/len}' file.fa
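A toy example to illustrate:
Input:
>seq1
ATGC
>seq2
GGCC
Output:
seq1	50.000
seq2	100.000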
- Find the shortest, longest, and average length of all sequences.
awk '/^>/ {getline seq; sum+=a[n++]=length(seq)} END {asort(a); printf "Min: %d\tMax: %d\tMean: %d\n", a[1], a[length(a)], sum/n}' file.fa
- Find the sequence(s) with the shortest and longest length.
awk '/^>/ {getline seq; sub(/^>/, "", $0); a[$0]=length(seq)} END {asort(a, b); min=b[1]; max=b[length(b)]; min_lst=max_lst=""; for (i in a) {if (a[i]==min) {min_lst=min_lst i " "}; if (a[i]==max) {max_lst=max_lst i " "}}; printf "Min: %d\t%s\nMax: %d\t%s\n", min, min_lst, max, max_lst}' file.fa
- Extract a region of a sequence, e.g., a gene from a contig (set id to the sequence header, and start and end to the 1-based start and end positions; set rc to any non-empty value to get the reverse complement).
awk -v id="" -v start=n -v end=m -v rc="" '
function revcomp(s) {
c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o="";
for (i=length(s); i>0; i--) {o=o c[substr(s, i, 1)]};
return o
}
($0~">"id) {
getline seq;
split(seq, s, "");
j=s[start];
for (i=start+1; i<=end; i++) {
j=j s[i]
};
if (rc) {
print $0 "\n" revcomp(j)
} else {
print $0 "\n" j
}
}' file.fa
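As a sketch, run the command with -v id="contig_1" -v start=3 -v end=6 -v rc="" (set rc=1 to get the reverse complement, ACGC, instead). Note that revcomp maps only uppercase A, C, G, and T.
Input:
>contig_1
ATGCGTACGT
Output:
>contig_1
GCGT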
- Locate a region of a sequence (set query to the desired pattern; regex and Standard Ambiguity Codes are accepted).
awk -v query="<pattern>" '
function revcomp(s) {
c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o="";
for (i=length(s); i>0; i--) {o=o c[substr(s, i, 1)]};
return o
}
function genregex(str) {
hash["A"]="A"; hash["T"]="T"; hash["G"]="G"; hash["C"]="C";
hash["R"]="[AG]"; hash["Y"]="[CT]"; hash["S"]="[GC]"; hash["W"]="[AT]"; hash["K"]="[GT]"; hash["M"]="[AC]";
hash["B"]="[CGT]"; hash["D"]="[AGT]"; hash["H"]="[ACT]"; hash["V"]="[ACG]";
hash["N"]="[ATGC]";
regex="";
for (i=1; i<=length(str); i++) {if (hash[substr(str, i, 1)]) {regex=regex hash[substr(str, i, 1)]} else {regex=regex substr(str, i, 1)}};
return regex
}
function recwrap(str1, query1) {
pos=""; end=0;
return recfunc(str1, query1)
}
function recfunc(str2, query2) {
if (match(str2, query2)!=0) {
start=end+RSTART; end=end+RSTART+RLENGTH-1; pos=pos (pos=="" ? "" : " ") start ".." end;
recfunc(substr(str2, RSTART+RLENGTH, length(str2)), query2)};
return pos
} /^>/ {getline seq; gsub(/^>/, "", $0); loc=recwrap(seq, genregex(query)); if (loc!="") {print $0 "\t" loc "\t1"} else {loc=recwrap(seq, revcomp(genregex(query))); if (loc!="") {split(loc, revloc, /\.\./); print $0 "\t" length(seq)-revloc[2]+1 ".." length(seq)-revloc[1]+1 "\t-1"}}}' file.fa
- Get the reverse complement of each sequence.
awk 'function revcomp(s) {o=""; cmd="printf \"%s\" " s "| tr \"ATGC\" \"TACG\" | rev"; while ((cmd | getline o)>0) {}; close(cmd); return o} /^>/ {getline seq; print $0"\n"revcomp(seq)}' file.fa
awk 'function revcomp(s) {c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o=""; for (i=length(s); i>0; i--) {o=o c[substr(s, i, 1)]}; return o} /^>/ {getline seq; print $0"\n"revcomp(seq)}' file.fa
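A toy example to illustrate (the second variant maps only uppercase A, C, G, and T):
Input:
>seq1
ATGC
Output:
>seq1
GCAT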
- Deduplicate sequences based on their header.
awk '/^>/ {f=!a[$0]++} f' file.fa
- Deduplicate sequences based on the sequences themselves.
awk '/^>/ {getline seq; f=!a[seq]++} f {print $0"\n"seq}' file.fa
- Deduplicate and remove sequences that match other longer sequences.
awk '
/^>/ {
getline sequence
count++
headers[count] = $0
sequences[count] = sequence
lengths[count] = length(sequence)
}
END {
for (i = 1; i <= count; i++) {
for (j = i+1; j <= count; j++) {
if (lengths[i] > lengths[j]) {
tmp = lengths[i]; lengths[i] = lengths[j]; lengths[j] = tmp
tmp = sequences[i]; sequences[i] = sequences[j]; sequences[j] = tmp
tmp = headers[i]; headers[i] = headers[j]; headers[j] = tmp
}
}
}
for (i = 1; i <= count; i++) {
deduped = 1
for (j = i + 1; j <= count; j++) {
if (index(sequences[j], sequences[i]) > 0) {
deduped = 0 # Mark as duplicate
break
}
}
if (deduped) {
print headers[i]
print sequences[i]
}
}
}' file.fa
- Remove singletons.
awk '/^>/ {getline seq; a[seq][$0"\n"seq]=b[seq]++} END {for (i in b) if (b[i]>1) c[i]; for (i in c) for (j in a[i]) print j}' file.fa
- Find sequences, based on their headers, from a sequence matching patternA up to (and excluding) a sequence matching patternB.
awk '/^>patternA/ {f=1} /^>patternB/ {f=0} f' file.fa
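A toy example to illustrate, with patternA=seq2 and patternB=seq4:
awk '/^>seq2/ {f=1} /^>seq4/ {f=0} f' file.fa
Input:
>seq1
AAA
>seq2
CCC
>seq3
GGG
>seq4
TTT
Output:
>seq2
CCC
>seq3
GGG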
- Map sequences from file1.fa against file2.fa: slide the shorter of each pair of sequences along the longer one and report matches (Standard Ambiguity Codes supported), highlighting the matched region in red and printing its location.
awk '
function generate_regex(seq, regex, hash, i , nu) {
regex = ""
hash["A"] = "A"; hash["C"] = "C"; hash["G"] = "G"; hash["T"] = "T"
hash["R"] = "[AG]"; hash["Y"] = "[CT]"; hash["S"] = "[GC]"; hash["W"] = "[AT]"
hash["K"] = "[GT]"; hash["M"] = "[AC]"; hash["B"] = "[CGT]"; hash["D"] = "[AGT]"
hash["H"] = "[ACT]"; hash["V"] = "[ACG]"; hash["N"] = "[ACGT]"
for (i = 1; i <= length(seq); i++) {
nu = substr(seq, i, 1)
regex = regex "" hash[nu]
}
return regex
}
function find_match(seq1, seq2, min, max, query, reference, i, sub_reference, query_pattern, sub_reference_pattern) {
if (length(seq1) > length(seq2)) {
min = length(seq2)
max = length(seq1)
query = seq2
reference = seq1
} else {
min = length(seq1)
max = length(seq2)
query = seq1
reference = seq2
}
for (i = 1; i <= max - min + 1; i++) {
sub_reference = substr(reference, i, min)
if (length(query) == length(sub_reference) && length(query) == min) {
query_pattern = generate_regex(query)
sub_reference_pattern = generate_regex(sub_reference)
if (query ~ sub_reference_pattern || sub_reference ~ query_pattern) {
# Print query
printf "%s\t", query
# Highlight the match region in red
printf "%s", substr(reference, 1, i - 1)
printf "\033[31m%s\033[0m", substr(reference, i, min)
printf "%s\t", substr(reference, i + min)
# Print location
printf "%d..%d\n", i, i + min - 1
}
}
}
}
FNR==NR {if (/^>/) {getline seq; a[$0]=seq}; next} /^>/ {getline seq; b[$0]=seq} END {for (i in a) {for (j in b) {find_match(a[i], b[j])}}}
' file1.fa file2.fa
- Remove tandem repeats. Repeats of length from 2 to 6 are removed.
awk -v k=6 'BEGIN {split("ATGC", a, ""); split("ATGC", b, ""); while (i<k-1) {i++; temp=""; for (m in a) {for (n in b) {temp=temp (temp ? " " : "") a[m] b[n]; c[i*(m*length(b)+n+1)]=a[m] b[n]}}; split(temp, b, " ")}; delete a; delete b} /^>/ {getline seq; a[$0]=seq} END {for (i in a) {for (j in c) {l=(length(c[j])>3 ? 4 : 8); regex="("c[j]"){"l",}"; if (match(a[i], regex)) {gsub(regex, "", a[i])}}; print i"\n"a[i]}}' file.fa
- Delineate sequences with ambiguity codes (adapted from an idea by James Curtis-Smith, a member of the Perl community).
Nucleotides:
awk 'function delineate(id, s) {hash["R"]="A G"; hash["Y"]="C T"; hash["S"]="G C"; hash["W"]="A T"; hash["K"]="G T"; hash["M"]="A C"; hash["B"]="C G T"; hash["D"]="A G T"; hash["H"]="A C T"; hash["V"]="A C G"; hash["N"]="A T G C"; solutions=1; for (i in hash) {n=split(hash[i], a, " "); solutions*=entries[++j]=n; keys[j]=i; for (k=0; k<n; k++) {list[i][k]=a[k+1]}}; for (i=1; i<=solutions; i++) {idx=i; for (j=1; j<=length(entries); j++) {pairs[i][j]=keys[j] " " list[keys[j]][idx%entries[j]]; idx=int(idx/entries[j])}}; for (i in pairs) {tmp=s; for (j in pairs[i]) {split(pairs[i][j], re, " "); gsub(re[1], re[2], tmp)}; res[tmp]}; for (i in res) {seen[id][i]++}; delete res; for (i in seen) {count=1; for (j in seen[i]) print i "_" count++ "\n" j}; delete seen} BEGIN {IGNORECASE=1} /^>/ {getline seq; delineate($0, seq)}' file.fna
Input:
>Sequence_1
BAT
>Sequence_2
CGYN
Output:
>Sequence_1_1
CAT
>Sequence_1_2
TAT
>Sequence_1_3
GAT
>Sequence_2_1
CGTT
>Sequence_2_2
CGCA
>Sequence_2_3
CGTC
>Sequence_2_4
CGCG
awk 'function delineate(id, s) {hash["A"]="A"; hash["T"]="T"; hash["G"]="G"; hash["C"]="C"; hash["R"]="A G"; hash["Y"]="C T"; hash["S"]="G C"; hash["W"]="A T"; hash["K"]="G T"; hash["M"]="A C"; hash["B"]="C G T"; hash["D"]="A G T"; hash["H"]="A C T"; hash["V"]="A C G"; hash["N"]="A T G C"; solutions=1; tmp=s; m=split(tmp, a, ""); for (i=1; i<=m; i++) {n=split(hash[a[i]], b, " "); solutions*=n; for (j=0; j<n; j++) {list[i][j]=b[j+1]}}; for (i=1; i<=solutions; i++) {tmp=""; idx=i; for (j=1; j<=m; j++) {tmp=tmp list[j][idx%length(list[j])]; idx=int(idx/length(list[j]))}; print id "_" i "\n" tmp}} /^>/ {getline seq; delineate($0, seq)}' file.fna
Amino acids:
awk 'function delineate(id, s) {tmp=s; while (match(tmp, /(\[[^]]*\])/, key)) {tmp=substr(tmp, RSTART+RLENGTH); value=key[1]; gsub(/[][]/, "\\\\&", key[1]); gsub(/[][]/, "", value); gsub(/./, "& ", value); hash[key[1]]=value}; solutions=1; for (i in hash) {n=split(hash[i], a, " "); solutions*=entries[++j]=n; keys[j]=i; for (k=0; k<n; k++) {list[i][k]=a[k+1]}}; for (i=1; i<=solutions; i++) {idx=i; for (j=1; j<=length(entries); j++) {pairs[i][j]=keys[j] " " list[keys[j]][idx%entries[j]]; idx=int(idx/entries[j])}}; for (i in pairs) {tmp=s; for (j in pairs[i]) {split(pairs[i][j], re, " "); gsub(re[1], re[2], tmp)}; res[tmp]}; for (i in res) {seen[id][i]++}; delete res; for (i in seen) {count=1; for (j in seen[i]) print i "_" count++ "\n" j}; delete seen} BEGIN {IGNORECASE=1} /^>/ {getline seq; delineate($0, seq)}' file.faa
Input:
>Sequence_1
A[BC]T[KM]G
Output:
>Sequence_1_1
ABTKG
>Sequence_1_2
ACTKG
>Sequence_1_3
ABTMG
>Sequence_1_4
ACTMG
- Reverse translate amino acid sequence to nucleotide sequence in compressed format (please refer to the Standard Ambiguity Codes).
awk 'function reverse_translate(id, s) {
hash["W"] = "TGG"; hash["Y"] = "TAY"; hash["C"] = "TGY"; hash["E"] = "GAR";
hash["K"] = "AAR"; hash["Q"] = "CAR"; hash["S"] = "WSN"; hash["L"] = "YTN";
hash["R"] = "MGN"; hash["G"] = "GGN"; hash["F"] = "TTY"; hash["D"] = "GAY";
hash["H"] = "CAY"; hash["N"] = "AAY"; hash["M"] = "ATG"; hash["A"] = "GCN";
hash["P"] = "CCN"; hash["T"] = "ACN"; hash["V"] = "GTN"; hash["I"] = "ATH";
hash["*"] = "TRR";
solutions=1
for (i in hash) {
n = split(hash[i], a, " ")
solutions *= entries[++j] = n
keys[j] = i
for (k = 0; k < n; k++) {
list[i][k] = a[k+1]
}
}
for (i = 1; i <= solutions; i++) {
idx = i
for (j = 1; j <= length(entries); j++) {
pairs[i][j] = keys[j] " " list[keys[j]][idx % entries[j]]
idx = int(idx/entries[j])
}
}
count = 0
for (i in pairs) {
count++
for (j in pairs[i]) {
split(pairs[i][j], re, " ")
for (m = 1; m <= length(s); m++) {
each = substr(s, m, 1)
if (gsub(re[1], re[2], each)) {
res[count][m] = each
}
}
}
}
for (i in res) {
tmp=""
for (j in res[i]) {
tmp=tmp "" res[i][j]
}
seen[id][tmp]++
}
delete res
for (i in seen) {
count=1
for (j in seen[i]) {
print i (length(seen[i]) > 1 ? "_" count++ : "") "\n" j
}
}
delete seen
}
BEGIN {IGNORECASE = 1}
/^>/ {
getline seq
reverse_translate($0, seq)
}' file.faa
Input:
>Sequence_1
MARS
>Sequence_2
MEFTV
Output:
>Sequence_1_1
ATGGCNMGRAGY
>Sequence_1_2
ATGGCNMGRTCN
>Sequence_1_3
ATGGCNCGNAGY
>Sequence_1_4
ATGGCNCGNTCN
>Sequence_2
ATGGARTTYACNGTN
- Generate a consensus sequence from sequences of the same length.
awk 'BEGIN {
hash["A"] = "A"; hash["T"] = "T"; hash["G"] = "G"; hash["C"] = "C";
hash["CG"] = "S"; hash["AT"] = "W"; hash["AG"] = "R"; hash["CT"] = "Y";
hash["GT"] = "K"; hash["AC"] = "M"; hash["CGT"] = "B"; hash["ACG"] = "V";
hash["ACT"] = "H"; hash["AGT"] = "D"; hash["ACGT"] = "N";
}
/^>/ {
getline seq
a[++i] = seq
}
END {
len = length(a[1])
res = ""
for (i = 1; i <= len; i++) {
tmp = ""
for (j in a) {
split(a[j], b, "")
if (tmp !~ b[i]) {
tmp = tmp b[i]
}
}
sorted = ""
split(tmp, chars, "")
asort(chars)
for (k in chars) {
sorted = sorted chars[k]
}
res = res hash[sorted]
}
print res
}' file.fa
- Evaluate primers (replace GCAN with the desired primer sequence). This one-liner accepts Standard Ambiguity Codes.
awk -v primer="GCAN" 'function recwrap(str1, query1) {pos=""; end=0; return recfunc(str1, query1)} function recfunc(str2, query2) {if (match(str2, query2)!=0) {start=end+RSTART; end=end+RSTART+RLENGTH-1; pos=pos (pos=="" ? "" : " ") "[" start "," end "]"; recfunc(substr(str2, RSTART+RLENGTH, length(str2)), query2)}; return pos} function primerregex(s) {hash["R"]="A G"; hash["Y"]="C T"; hash["S"]="G C"; hash["W"]="A T"; hash["K"]="G T"; hash["M"]="A C"; hash["B"]="C G T"; hash["D"]="A G T"; hash["H"]="A C T"; hash["V"]="A C G"; hash["N"]="A T G C"; tmp=s; m=split(tmp, a, ""); res=""; for (i=1; i<=m; i++) {gsub(/ /, "", hash[a[i]]); pos=(hash[a[i]]!="" ? "[" hash[a[i]] "]" : a[i]); res=res pos}; return res} BEGIN {IGNORECASE=1} /^>/ {getline seq; gsub(/^>/, "", $0); if (!original) {original=primer; primer=primerregex(primer)}; loc=recwrap(seq, primer); if (loc!="") {print $0 "\t" original "\t" loc}}' file.fna
- Extract flanking region with forward primer and reverse primer.
awk -v fp="GCA" -v rp="AAN" 'function recwrap(str1, query1) {pos=""; end=0; return recfunc(str1, query1)} function recfunc(str2, query2) {if (match(str2, query2)!=0) {start=end+RSTART; end=end+RSTART+RLENGTH-1; pos=pos (pos=="" ? "" : " ") "[" start "," end "]"; recfunc(substr(str2, RSTART+RLENGTH, length(str2)), query2)}; return pos} function revcomp(str) {c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o=""; for (k=length(str); k>0; k--) {o=o c[substr(str, k, 1)]}; return o} function primerregex(s) {hash["R"]="A G"; hash["Y"]="C T"; hash["S"]="G C"; hash["W"]="A T"; hash["K"]="G T"; hash["M"]="A C"; hash["B"]="C G T"; hash["D"]="A G T"; hash["H"]="A C T"; hash["V"]="A C G"; hash["N"]="A T G C"; tmp=s; m=split(tmp, a, ""); res=""; for (i=1; i<=m; i++) {gsub(/ /, "", hash[a[i]]); pos=(hash[a[i]]!="" ? "[" hash[a[i]] "]" : a[i]); res=res pos}; return res} BEGIN {IGNORECASE=1} /^>/ {getline seq; fp=primerregex(fp); rp=primerregex(rp); floc=recwrap(seq, fp); rloc=recwrap(revcomp(seq), rp); split(floc, a, " "); split(rloc, b, " "); for (i in a) {for (j in b) {match(a[i], /,(.+)\]/, fp_loc); match(b[j], /\[(.+),/, rv_loc); rv_loc[1]=length(seq)-rv_loc[1]-1; print $0 " [" fp_loc[1]+1 "," rv_loc[1]-1 "]" "\n" substr(seq, fp_loc[1]+1, rv_loc[1]-1-fp_loc[1])}}}' <(printf ">Sequence\nGCAGCAACGTACGTTTTGCTTTACT")
- Convert genbank format to fasta format.
awk '/ACCESSION/ {match($0, / +(.*)/, id)} /SOURCE/ {match($0, / +(.*)/, src)} /ORIGIN/ {seq=""; while ((getline line)>0) {if (line ~ /\/\//) {break} else {gsub(/[0-9]+/, "", line); gsub(/\r/, "", line); seq=seq toupper(line)}}; gsub(/ /, "", seq)} END {print ">" (id[1] ~ /[[:alnum:]]/ ? id[1] "_" src[1] : src[1]) "\n" seq}' file.gb
- Extract feature(s) from genbank and convert to fasta.
from Ed Morton and vgersh99:
awk -v width=80 'function collect(array, tag, value) {if (key=="CDS") {while (match(record, /\/([^=]+)=(\S+|"[^"]+")/, tmp)) {tag=tmp[1]; value=tmp[2]; gsub(/^"|"$/, "", value); array[tag]=value; record=substr(record, RSTART+RLENGTH)}; if (width) {gsub(/\s+/,"", array["translation"]); gsub(".{" width "}", "&" RS, array["translation"]); sub(RS "$", "", array["translation"])} else {gsub(/\s+/, "", array["translation"])}; print ">" array["protein_id"], array["db_xref"], array["locus_tag"], array["product"]; print array["translation"]}; record=""} (NF==2 && !check && !/"/) {collect(); key=$1; sub(/\s*\$+/, "")} {if (check) {check=(sub(/"/, "&") ? 0 : check)} else {check=(n=gsub(/"/, "&") ? n%2 : 0); gsub(/^\s+|\s+$/, ""); record=(record=="" ? "" : record " ") $0}} END {collect()}' file.gb
general approach:
awk 'function revcomp(s) {c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o=""; for (i=length(s); i>0; i--) {o=o c[substr(s, i, 1)]}; return o} /.*[0-9]+.*\.\..*[0-9]+.*/ {split($0, a, " "); gsub(/[><]/, "", a[2]); b[a[2]]=a[1]} /ACCESSION/ {match($0, /ACCESSION.* (.*)/, acc)} /ORIGIN/ {seq=""; while ((getline line)>0) {if (line ~ /\/\//) {break} else {gsub(/[0-9]+/, "", line); gsub(/\r/, "", line); seq=seq toupper(line)}}; gsub(/ /, "", seq)} END {revseq=revcomp(seq); split(revseq, revarr, ""); split(seq, fwdarr, ""); for (i in b) {res=">" acc[1] "|" b[i] "|" i "\n"; if (i ~ /complement/) {match(i, /([0-9]+)\.\./, start); match(i, /\.\.([0-9]+)/, end); for (j=length(revseq)-end[1]+1; j<=length(revseq)-start[1]+1; j++) {res=res revarr[j]}} else if (i ~ /join/) {match(i, /\((.+)\)/, locs); split(locs[1], locs, ","); for (idx in locs) {match(locs[idx], /([0-9]+)\.\./, start); match(locs[idx], /\.\.([0-9]+)/, end); for (j=start[1]; j<=end[1]; j++) {res=res fwdarr[j]}}} else {match(i, /([0-9]+)\.\./, start); match(i, /\.\.([0-9]+)/, end); for (j=start[1]; j<=end[1]; j++) {res=res fwdarr[j]}}; print res}}' file.gb
- Roll forward a number of nucleotides (in circular topology).
awk -v id="" -v roll=0 '$0 ~ "^>" id {getline seq; if (roll > length(seq)) {exit}; print $0 "_before" "\n" seq; split(seq "" seq, a, ""); b=""; for (i=roll+1; i<=length(a) && length(b)<length(seq); i++) {b=b "" a[i]}; print $0 "_after" "\n" b}' file.fa
Utility
Tables
- Convert csv to tsv.
awk 'BEGIN {FPAT="([^,]*)|(\"[^\"]+\")"; OFS="\t"} {for (i=1; i<=NF; ++i) {if (substr($i, 1, 1)=="\"") {$i=substr($i, 2, length($i)-2)}}; for (i=1; i<=NF; i++) printf "%s%s", $i, (i==NF ? "\n" : OFS)}' file.csv
- Convert tsv to csv.
awk 'BEGIN {FS="\t"; OFS=","} {rebuilt=0; for(i=1; i<=NF; i++) {if ($i~/,/ && $i!~/^".*"$/) {gsub("\"", "\"\"", $i); $i="\""$i"\""; rebuilt=1}}; if (!rebuilt) {$1=$1} print}' file.tsv
- Remove a column (here I remove the 2nd column).
awk -v k=2 'BEGIN {FS=OFS="\t"} {$k=""; for (i=1; i<=NF; i++) {if ($i!="") printf "%s%s", $i, (i<NF ? OFS : ORS)}}' table.tsv
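A toy example to illustrate (k=2):
Input:
ID	colA	colB
X	1	2
Output:
ID	colB
X	2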
- Swap a column with another (here I swap the 1st column and the 2nd column).
awk -v m=1 -v n=2 'BEGIN {FS=OFS="\t"} {t=$m; $m=$n; $n=t; print}' table.tsv
- Filter by column sum.
awk -v n=0 'BEGIN {FS=OFS="\t"} {for (i=2; i<=NF; i++) colsum[i]+=$i; lines[NR]=$0} END {for (row=1; row<=NR; row++) {split(lines[row], fields, FS); printf "%s", fields[1]; for (i=2; i<=NF; i++) if (colsum[i] > n) printf "%s%s", OFS, fields[i]; printf ORS}}' file.tsv
- Filter by row sum.
awk -v n=0 'BEGIN {FS=OFS="\t"} {rowsum=0; for (i=2; i<=NF; i++) rowsum+=$i; if (rowsum > n) print $0}' file.tsv
- Transpose table.
awk 'BEGIN {FS=OFS="\t"} {if (max_cols<NF) {max_cols=NF}; max_rows=NR; for (i=1; i<=NF; i++) {a[i][NR]=$i}} END {for (i=1; i<=max_cols; i++) {printf "%s%s", a[i][1], OFS; for (j=2; j<=max_rows; j++) {printf "%s%s", a[i][j], (j==max_rows ? "" : OFS)}; printf "\n"}}' table.tsv
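A toy example to illustrate:
Input:
a	b
c	d
e	f
Output:
a	c	e
b	d	f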
- Melt table.
awk 'BEGIN {FS=OFS="\t"} {if (max_cols<NF) {max_cols=NF}; max_rows=NR; for (i=1; i<=NF; i++) {a[i][NR]=$i}} END {for (i=2; i<=max_cols; i++) {for (j=max_rows; j>=2; j--) {print a[i][1], a[1][j], a[i][j]}}}' table.tsv
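A toy example to illustrate (the header row starts with a tab):
Input:
	C1	C2
R1	1	2
R2	3	4
Output:
C1	R2	3
C1	R1	1
C2	R2	4
C2	R1	2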
- Outer join (for multiple tables).
awk 'BEGIN {FS=OFS="\t"} {a[$1][ARGIND]=$2} END {for (i in a) {printf "%s%s", i, OFS; for (j=1; j<=ARGIND; j++) {k=(j in a[i] ? a[i][j] : 0); printf "%s%s", k, (j<ARGIND ? OFS : ORS)}}}' table*.tsv
Input:
Table1.tsv:
	C1
R2	4
R3	3
Table2.tsv:
	C2
R1	6
R3	7
R4	9
Table3.tsv:
	C3
R1	1
R5	8
Output:
	C1	C2	C3
R1	0	6	1
R2	4	0	0
R3	3	7	0
R4	0	9	0
R5	0	0	8
- Outer join (for multiple tables) using filenames as column names.
awk 'BEGIN {FS=OFS="\t"} {a[$1][ARGIND]=$2; filenames[ARGIND]=FILENAME} END {printf "ID%s", OFS; for (x=1; x<=ARGIND; x++) printf "%s%s", filenames[x], (x<ARGIND ? OFS : ORS); for (i in a) {printf "%s%s", i, OFS; for (j=1; j<=ARGIND; j++) {k=(j in a[i] ? a[i][j] : 0); printf "%s%s", k, (j<ARGIND ? OFS : ORS)}}}' table*.tsv
- Inner join (for multiple tables).
awk 'BEGIN {FS=OFS="\t"} {a[$1][ARGIND]=$2} END {for (i in a) {printf "%s%s", i, OFS; for (j=1; j<=ARGIND; j++) {k=(j in a[i] ? a[i][j] : 0); printf "%s%s", k, (j<ARGIND ? OFS : ORS)}}}' table*.tsv | awk 'BEGIN {FS=OFS="\t"} NR==1 {print; next} {keep=1; for (i=2; i<=NF; i++) if ($i==0) {keep=0; break}; if (keep) print}'
- Exclusive join (for multiple tables).
awk 'BEGIN {FS=OFS="\t"} {a[$1][ARGIND]=$2} END {for (i in a) {printf "%s%s", i, OFS; for (j=1; j<=ARGIND; j++) {k=(j in a[i] ? a[i][j] : 0); printf "%s%s", k, (j<ARGIND ? OFS : ORS)}}}' table*.tsv | awk 'BEGIN {FS=OFS="\t"} NR==1 {print; next} {for (i=2; i<=NF; i++) if ($i==0) {print; next}}'
- Right join (for two tables).
awk 'BEGIN {FS=OFS="\t"} FNR==NR {a[$1]=$2; next} {print ($1 in a ? $0 OFS a[$1] : $0 OFS 0)}' table1.tsv table2.tsv
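A toy example to illustrate (every row of table2.tsv is kept; keys missing from table1.tsv get 0):
table1.tsv:
R1	x
R2	y
table2.tsv:
R1	5
R3	7
Output:
R1	5	x
R3	7	0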
- Find inclusive and exclusive elements based on one column.
awk 'BEGIN {FS=OFS="\t"} {a[$1][FILENAME]=b[$1]+=1} END {for (i in b) {if (b[i]==1) {for (j in a[i]) {print i, j}} else if (b[i]>1) {j=""; for (k in a[i]) {j=j k " "}; print i, j}}}' table*.tsv
- Deduplicate indices and sum values associated with those indices.
awk 'BEGIN {FS=OFS="\t"} NR==1 {print; next} {label[$1]++; for (i=1; i<=NF; i++) {dup[$1][i]+=$i}} END {for (i in label) {printf "%s%s", i, OFS; for (j=2; j<=NF; j++) {$j=dup[i][j]; printf "%s%s", $j, (j<NF ? OFS : ORS)}}}' table.tsv
Input:
	C1	C2	C3
R1	2	3	8
R1	5	1	6
R2	0	4	5
R3	1	7	9
Output:
	C1	C2	C3
R1	7	4	14
R2	0	4	5
R3	1	7	9
- Sum by column.
awk 'BEGIN {FS=OFS="\t"} NR==1 {gsub(/^\t/, "", $0); print; next} {for (i=2; i<=NF; i++) sum[i]+=$i} END {for (i in sum) printf "%s%s", sum[i], (i==length(sum)+1 ? "\n" : "\t")}' table.tsv
- Sum by row.
awk 'BEGIN {FS=OFS="\t"} NR>1 {for (i=2; i<=NF; i++) {label[NR]=$1; sum[NR]+=$i}} END {for (i in sum) print label[i], sum[i]}' table.tsv
- Calculate percentage by column.
awk 'function percent(value, total) {return (total!=0 ? sprintf("%.2f", 100*value/total) : "NA")} BEGIN {FS=OFS="\t"} NR==1 {print; next} {label[NR]=$1; for (i=2; i<=NF; i++) {sum[i]+=col[i][NR]=$i}} END {for (i=2; i<=NR; i++) {$1=label[i]; for (j=2; j<=NF; j++) {$j=percent(col[j][i], sum[j])}; print}}' table.tsv
- Calculate percentage by row.
awk 'function percent(value, total) {return (total!=0 ? sprintf("%.2f", 100*value/total) : "NA")} BEGIN {FS=OFS="\t"} NR==1 {print; next} {label[NR]=$1; for (i=2; i<=NF; i++) {sum[NR]+=col[i][NR]=$i}} END {for (i=2; i<=NR; i++) {$1=label[i]; for (j=2; j<=NF; j++) {$j=percent(col[j][i], sum[i])}; print}}' table.tsv
- Select row based on max or min value of a column.
Example: I take the result from BLAST outfmt 6 to select the row based on the highest percent identity and the highest query coverage.
awk 'function abs(n) {return (n>0 ? n : -n)} BEGIN {FS=OFS="\t"} ($3>max_pident[$1] && abs($(NF-3)-$(NF-2)+1)/$5>=max_qcov[$1]) {max_pident[$1]=$3; max_qcov[$1]=abs($(NF-3)-$(NF-2)+1)/$5; a[$1]=$0} END {for (i in a) print a[i]}' result.out
Input:
qseqid	sseqid	pident	length	qlen	slen	evalue	bitscore	mismatch	gapopen	qstart	qend	sstart	send
sif8011_100121348_f_151	sodA_153	87.931	58	151	554	1.10e-13	69.4	7	0	73	130	489	546
sif8011_100121348_f_151	sodA_108	87.931	58	151	554	1.10e-13	69.4	7	0	73	130	489	546
sif8011_100121348_f_151	sodA_96	87.931	58	151	554	1.10e-13	69.4	7	0	73	130	489	546
sif8011_100121348_f_151	sodA_88	87.931	58	151	554	1.10e-13	69.4	7	0	73	130	489	546
sif8011_100121348_f_151	sodA_71	87.931	58	151	554	1.10e-13	69.4	7	0	73	130	489	546
sif8011_100121348_r_151	sodA_153	87.931	58	151	554	1.10e-13	69.4	7	0	84	141	546	489
sif8011_100121348_r_151	sodA_108	87.931	58	151	554	1.10e-13	69.4	7	0	84	141	546	489
sif8011_100121348_r_151	sodA_96	87.931	58	151	554	1.10e-13	69.4	7	0	84	141	546	489
sif8011_100121348_r_151	sodA_88	87.931	58	151	554	1.10e-13	69.4	7	0	84	141	546	489
sif8011_100121348_r_151	sodA_71	87.931	58	151	554	1.10e-13	69.4	7	0	84	141	546	489
Output:
qseqid	sseqid	pident	length	qlen	slen	evalue	bitscore	mismatch	gapopen	qstart	qend	sstart	send
sif8011_100121348_r_151	sodA_153	87.931	58	151	554	1.10e-13	69.4	7	0	84	141	546	489
sif8011_100121348_f_151	sodA_153	87.931	58	151	554	1.10e-13	69.4	7	0	73	130	489	546
- Filter BLAST result by removing results with shorter length and lower pident.
awk 'BEGIN {FS=OFS="\t"} {a[$1][count++]=$0} END {for (i in a) {for (j=0; j<count; j++) {for (k=j+1; k<count; k++) {split(a[i][j], s1, " "); split(a[i][k], s2, " "); start1=s1[length(s1)-3]; end1=s1[length(s1)-2]; start2=s2[length(s2)-3]; end2=s2[length(s2)-2]; pident1=s1[3]; pident2=s2[3]; if (start1<end1 && start2<end2 && start1>=start2 && end1<=end2) {if (pident1>=pident2) {delete a[i][k]} else {delete a[i][j]}} else if (start1>end1 && start2>end2 && start1<=end2 && start1>=end2) {if (pident1>=pident2) {delete a[i][k]} else {delete a[i][j]}}}}}; for (i in a) for (j in a[i]) if (a[i][j]!="") print a[i][j]}' table.tsv
- Group homologs from BLAST result.
awk 'function recfind(a) {for (i in a) {for (m in a) {for (n in a[m]) {if (m!=i && m in a[i]) {a[i][n]; delete a[m][n]; recfind(a)} else if (m!=i && n!=i && n in a[i]) {a[i][m]}}}}} BEGIN {FS=OFS="\t"} {a[$1][$2]} END {recfind(a); for (i in a) {t=i; for (j in a[i]) {if (i!=j) t=t" "j}; if (t~/\s/) printf "%d\t%s\n", ++count, t}}' table.tsv
Input:
query	subject
S1	S2
S1	S3
S1	S4
S2	S4
S2	S5
S5	S6
S7	S8
S7	S9
S8	S9
S10	S2
S11	S2
S12	S13
Output:
1 S1 S2 S3 S10 S4 S11 S5 S6
2 S12 S13
3 S7 S8 S9
- Find all homologs of queries from BLAST result (set p and q for pident and qcov).
awk -v p=98 -v q=95 'function abs(n) {return (n>0 ? n : -n)} function recfind(a) {for (i in a) {for (m in a) {for (n in a[m]) {if (m!=i && m in a[i]) {a[i][n]; delete a[m][n]; recfind(a)} else if (m!=i && n!=i && n in a[i]) {a[i][m]}}}}} BEGIN {FS=OFS="\t"} ($3>=p && abs($(NF-3)-$(NF-2)+1)*100/$5>=q) {a[$1][$2]} END {recfind(a); for (i in a) {for (j in a[i]) if (i!=j) print i, j}}' result.out
Parallel processing of BLAST result with parallel.
cat result.out | parallel --pipe -q awk -v p=98 -v q=95 'function abs(n) {return (n>0 ? n : -n)} function recfind(a) {for (i in a) {for (m in a) {for (n in a[m]) {if (m!=i && m in a[i]) {a[i][n]; delete a[m][n]; recfind(a)} else if (m!=i && n!=i && n in a[i]) {a[i][m]}}}}} BEGIN {FS=OFS="\t"} ($3>=p && abs($(NF-3)-$(NF-2)+1)*100/$5>=q) {a[$1][$2]} END {recfind(a); for (i in a) {for (j in a[i]) if (i!=j) print i, j}}' > clusters.txt
- Find values that appear in all indices.
awk 'BEGIN {FS=OFS="\t"} {f=!a[$1]++; b[$2]++} END {for (i in b) {if (b[i]==length(a)) print i}}' table.tsv
Input:
index	value
S1	a
S1	b
S2	a
S2	b
S2	c
S3	a
S3	b
S4	a
S4	b
S4	c
Output:
a
b
- Count with grouping.
Example: I count the occurrences of the value in the second column, grouped by the first column.
awk 'BEGIN {FS=OFS="\t"} {count[$1][$2]++} END {for (i in count) {for (j in count[i]) {print i, j, count[i][j]}}}' table.tsv
Input:
Sequence	Gene
Seq1	Gene1
Seq1	Gene1
Seq1	Gene2
Seq2	Gene1
Seq2	Gene2
Seq3	Gene3
Seq3	Gene3
Seq3	Gene3
Output:
Seq1	Gene1	2
Seq1	Gene2	1
Seq2	Gene1	1
Seq2	Gene2	1
Seq3	Gene3	3
- Display the most occurring value with grouping.
awk 'BEGIN {FS=OFS="\t"} {count[$1][$2]++; max[$1]=(max[$1]>count[$1][$2] ? max[$1] : count[$1][$2])} END {for (i in count) {for (j in count[i]) {if (count[i][j]==max[i]) print i, j, count[i][j]}}}' table.tsv
Same input as in the example above.
Output:
Seq1	Gene1	2
Seq2	Gene1	1
Seq2	Gene2	1
Seq3	Gene3	3
Text
- Insert the 1st line of one file at the top of another.
sed -i "1i $(sed -n '1p' line_from.txt)" line_to.txt
sed -i "1s/^/$(sed -n '1p' line_from.txt)\n/" line_to.txt
- Remove the last n lines. Here, I remove the last five lines (from qstebom).
sed -n -e ':a; 1,5!{P; N; D}; N; ba' text.txt
sed -e ':a; $d; N; 1,5ba; P; D' text.txt
- Join lines together with comma as delimiter.
sed -E ':a; N; $!ba; s/\n/,/g' text.txt
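A toy example to illustrate:
Input:
a
b
c
Output:
a,b,c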
- Split the file into parts.
awk -v n=4 '{split(FILENAME, file, "."); a[count++]=$0} END {step=sprintf("%.0f", count/n); for (i=1; i<=n+1; i++) {for (j=step*(i-1); j<step*i; j++) {if (a[j]!="") print a[j] >> file[1] i "." file[2]}}}' file.txt
- Join lines between patterns.
awk '/patternA/ {f=1; concat=$0; next} f {concat=concat "\n" $0; gsub(/\n|\r/, "", concat)} /patternB/ {print concat; concat=""; f=0; next} !f {print}' file.txt
- Interleave line by line (for multiple text files).
awk 'BEGIN {for (i=2; i<ARGC; i++) {file[i]=ARGV[i]; ARGV[i]=""}} {printf "%s", $0; for (i=2; i<ARGC; i++) {if ((getline l < file[i]) > 0) printf "%s%s", OFS, l}; printf ORS}' text*.txt
awk 'BEGIN {do {flag=channel=0; while (++channel<ARGC) {if (getline < ARGV[channel]) {printf "%s", (channel<ARGC-1 ? $0 "\t" : $0 "\n")}; flag=1}} while (flag)}' text*.txt
- Interleave line by nth line (here is 4 lines for multiple text files).
awk -v step=4 'BEGIN {do {flag=0; for (i=1; i<ARGC; i++) {j=step; while (j-- > 0) {if ((getline l < ARGV[i]) > 0) {print l; flag=1}}}} while (flag)}' text*.txt
- Merge strings with common substring.
awk 'function concat(str1, str2) {split(str1, a1, ""); split(str2, a2, ""); while (1) {for (i in a1) {s1=substr(str1, i); if (str2~s1 && length(s1)>=3 && index(str2, s1)==1) {print s1; pos1=i; break}}; for (i in a2) {s2=substr(str2, i); if (str1~s2 && length(s2)>=3 && index(str1, s2)==1) {print s2; pos2=i; break}}; break}; c=""; if (pos1!="") {for (i=1; i<pos1+0; i++) {c=c a1[i]}; c=c str2} else if (pos2!="") {for (i=1; i<pos2+0; i++) {c=c a2[i]}; c=c str1}; return c} {split($0, a, " "); print concat(a[1], a[2])}' file.txt
Input:
ThisIsBeautiful IsBeautiful,IsIt?
Output:
IsBeautiful
ThisIsBeautiful,IsIt?
- Convert GFF to GTF (from Dzmitry Mukha).
awk 'BEGIN {FS=OFS="\t"} {if ($0~/#/) {next}; if ($3~/RNA/) {subname="transcript"} else {subname=$3}; str=$9; sub(".*ID=", "", str); sub(";.*", "", str); annot[subname"_id"] = str; annot["gene_id"] = str; annot["transcript_id"] = str; str=$9; sub(".*Name=", "", str); sub(";.*", "", str); annot[subname"_name"] = str; gsub("Note=", "note \"", $9); gsub(";", "\"; ", $9); gsub("=", " \"", $9); if ($9!~".*; $") {$9=$9 "\"; "}; sub("ID", subname"_id", $9); sub("Name", subname"_name", $9); if ($3 == "gene") {strand=$7; en=0; $9 = "gene_id \""annot["gene_id"]"\"; "; print} else if ($3=="mRNA") {$9="gene_id \""annot["gene_id"]"\"; transcript_id \""annot["transcript_id"]"\"; "; print} else if ($3=="CDS") {$9="gene_id \""annot["gene_id"]"\"; transcript_id \""annot["transcript_id"]"\"; "; $9=$9"gene_name \""annot["gene_name"]"\"; transcript_name \""annot["transcript_name"]"\"; "; print} else if ($3=="exon") {$9="gene_id \""annot["gene_id"]"\"; transcript_id \""annot["transcript_id"]"\"; exon_number \""++en"\"; gene_name \""annot["gene_name"]"\"; transcript_name \""annot["transcript_name"]"\"; "; print} else if ($3=="three_prime_UTR") {$9="gene_id \""annot["gene_id"]"\"; transcript_id \""annot["transcript_id"]"\"; "; print} else if ($3=="five_prime_UTR") {$9="gene_id \""annot["gene_id"]"\"; transcript_id \""annot["transcript_id"]"\"; "; print} else if ($3=="protein") {$9="gene_id \""annot["gene_id"]"\"; transcript_id \""annot["transcript_id"]"\"; gene_name \""annot["gene_name"]"\"; transcript_name \""annot["transcript_name"]"\";"; print}}' file.gff
Random sampling of reads/sequences with reservoir sampling
Credit to Umer Zeeshan Ijaz
I’ve made some improvements to make it more readable and easier to understand. Here I select k=10000; set k to the desired number of reads/sequences.
- Subsample paired-end reads in fastq format.
awk -v k=10000 -v seed=3 'BEGIN {srand(seed)} fname!=FILENAME {fname=FILENAME; idx++; srand(seed)} idx==1 {f1=FILENAME; if (NR%4==1 && $0~/^@/) {getline l2; getline l3; getline l4; s=(i++<k ? i-1 : int(rand()*i)); if (s<k) a[s]=$0 "\t" l2 "\t" l3 "\t" l4}} idx==2 {f2=FILENAME; if (NR%4==1 && $0~/^@/) {getline l2; getline l3; getline l4; s=(j++<k ? j-1 : int(rand()*j)); if (s<k) b[s]=$0 "\t" l2 "\t" l3 "\t" l4}} END {for (i in a) {gsub(/\t/, "\n", a[i]); print a[i] > "subsampled_" f1}; for (i in b) {gsub(/\t/, "\n", b[i]); print b[i] > "subsampled_" f2}}' forward.fastq reverse.fastq
- Subsample paired-end reads in fasta format.
awk -v k=10000 -v seed=3 'BEGIN {srand(seed)} fname!=FILENAME {fname=FILENAME; idx++; srand(seed)} idx==1 {f1=FILENAME; if (NR%2==1 && $0~/^>/) {getline seq; s=(i++<k ? i-1 : int(rand()*i)); if (s<k) a[s]=$0 "\t" seq}} idx==2 {f2=FILENAME; if (NR%2==1 && $0~/^>/) {getline seq; s=(j++<k ? j-1 : int(rand()*j)); if (s<k) b[s]=$0 "\t" seq}} END {for (i in a) {gsub(/\t/, "\n", a[i]); print a[i] > "subsampled_" f1}; for (i in b) {gsub(/\t/, "\n", b[i]); print b[i] > "subsampled_" f2}}' forward.fasta reverse.fasta
- Subsample single-end reads in fastq format.
awk -v k=10000 -v seed=3 'BEGIN {srand(seed)} NR%4==1 && /^@/ {getline l2; getline l3; getline l4; s=(i++<k ? i-1 : int(rand()*i)); if (s<k) a[s]=$0 "\t" l2 "\t" l3 "\t" l4} END {for (i in a) {gsub(/\t/, "\n", a[i]); print a[i]}}' file.fastq
- Subsample single-end reads in fasta format (can also be used for subsampling without replacement of sequences).
awk -v k=10000 -v seed=3 'BEGIN {srand(seed)} /^>/ {getline seq; s=(i++<k ? i-1 : int(rand()*i)); if (s<k) a[s]=$0 "\t" seq} END {for (i in a) {gsub(/\t/, "\n", a[i]); print a[i]}}' file.fasta
- Shuffle fasta.
awk -v seed=1 'BEGIN {srand(seed)} /^>/ {getline seq; a[s++]=$0 "\n" seq} END {for (i=0; i<s; i++) {idx[i]=i}; for (i=s; i>0; i--) {k=int(rand()*i); tmp=idx[k]; idx[k]=idx[i-1]; idx[i-1]=tmp}; for (i=0; i<s; i++) {print a[idx[i]]}}' file.fa
- Shuffle fastq.
awk -v seed=1 'BEGIN {srand(seed)} NR%4==1 && /^@/ {getline seq; getline id2; getline qual; a[s++]=$0 "\n" seq "\n" id2 "\n" qual} END {for (i=0; i<s; i++) {idx[i]=i}; for (i=s; i>0; i--) {k=int(rand()*i); tmp=idx[k]; idx[k]=idx[i-1]; idx[i-1]=tmp}; for (i=0; i<s; i++) {print a[idx[i]]}}' file.fq
Tips and tricks
- Handle multiple files. Here is an example when working with three files (from Kai Yuan).
awk 'fname!=FILENAME {fname=FILENAME; idx++} idx==1 {} idx==2 {} idx==3 {}' file1.txt file2.txt file3.txt
- Get k-nucleotides (from KamilCuk).
awk -v k=4 'function clone(original, copy) {for (i in original) {if (isarray(original[i])) {copy[i][1]=""; delete copy[i][1]; clone(original[i], copy[i])} else {copy[i]=original[i]}}} BEGIN {split("ATGC", a, ""); clone(a, b); i=0; while (i<k-1) {i++; temp=""; for (m in a) {for (n in b) {temp=temp (temp ? " " : "") a[m] b[n]}}; split(temp, b, " ")}; for (i in b) print i, b[i]}'
- Get palindromic k-nucleotides.
awk -v k=4 'function revcomp(s) {c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o=""; for(i=length(s); i>0; i--) {o=o c[substr(s, i, 1)]}; return o} function clone(original, copy) {for (i in original) {if (isarray(original[i])) {copy[i][1]=""; delete copy[i][1]; clone(original[i], copy[i])} else {copy[i]=original[i]}}} BEGIN {split("ATGC", a, ""); clone(a, b); i=0; while (i<k-1) {i++; temp=""; for (m in a) {for (n in b) {temp=temp (temp ? " " : "") a[m] b[n]}}; split(temp, b, " ")}; for (j in b) {if (b[j]==revcomp(b[j])) {print j, b[j]}}}'
- Get the k-mer frequency for each sequence.
awk -v k=n 'BEGIN {PROCINFO["sorted_in"]="@val_num_desc"} /^>/ {getline seq; sub(/^>/, "", $0); for (i=1; i<=length(seq)+1-k; i++) {s=substr(seq, i, k); a[s]++; sum++}; for (i in a) {printf "%s\t%s\t%.3f\n", $0, i, a[i]/sum}; delete a}' file.fa
- Get the k-mer frequency for the whole file.
awk -v k=n 'BEGIN {PROCINFO["sorted_in"]="@val_num_desc"} /^>/ {getline seq; for (i=1; i<=length(seq)+1-k; i++) {s=substr(seq, i, k); a[s]++; sum++}} END {for (i in a) {print i, a[i]}}' file.fa
- Get non-repeated k-mer frequency.
awk -v k=n 'BEGIN {PROCINFO["sorted_in"]="@val_num_desc"} function findreps(s) {for (i=2; i<=length(s); i++) {if (substr(s, i, 1)!=substr(s, 1, 1)) {return 0}}; return 1} /^>/ {getline seq; sub(/^>/, "", $0); count=0; for (j=1; j<=length(seq)+1-k; j++) {subs=substr(seq, j, k); sum++; if (findreps(subs)==0) {a[subs]++}}; for (i in a) {printf "%s\t%s\t%.2f\n", $0, i, a[i]/sum}; delete a}' file.fa
- Find unique k-mer in all sequences.
awk -v k=n '/^>/ {getline seq; sub(/^>/, "", $0); for (i=1; i<=length(seq)+1-k; i++) {s=substr(seq, i, k); a[s][$0]}} END {for (i in a) {if (length(a[i])==1) {for (j in a[i]) print j"\t"i}}}' file.fa
- Get the palindromic k-mer frequency (k must be an even number) amongst palindromes.
awk -v k=n 'function revcomp(s) {c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o=""; for (t=length(s); t>0; t--) {o=o c[substr(s, t, 1)]}; return o} BEGIN {PROCINFO["sorted_in"]="@val_num_desc"} /^>/ {getline seq; sub(/^>/, "", $0); for (i=1; i<=length(seq)+1-k; i++) {s=substr(seq, i, k); if (s==revcomp(s)) {a[s]++; sum++}}; for (i in a) {printf "%s\t%s\t%.2f\n", $0, i, a[i]/sum}; delete a}' file.fa
- Get the palindromic k-mer frequency (k must be an even number) amongst other k-mers.
awk -v k=n 'function revcomp(s) {c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o=""; for (t=length(s); t>0; t--) {o=o c[substr(s, t, 1)]}; return o} BEGIN {PROCINFO["sorted_in"]="@val_num_desc"} /^>/ {getline seq; sub(/^>/, "", $0); for (i=1; i<=length(seq)+1-k; i++) {s=substr(seq, i, k); a[s]++; sum++}; for (j in a) {if (j==revcomp(j)) {printf "%s\t%s\t%.2f\n", $0, j, a[j]/sum}}; delete a}' file.fa
- Find all possible palindromes e.g., ATAT.
awk 'function revcomp(s) {c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o=""; for (t=length(s); t>0; t--) {o=o c[substr(s, t, 1)]}; return o} BEGIN {PROCINFO["sorted_in"]="@ind_str_desc"; OFS="\t"} /^>/ {getline seq; sub(/^>/, "", $0); for (i=4; i<=length(seq); i+=2) {for (j=1; j<=length(seq)+1-i; j++) {s=substr(seq, j, i); if (s==revcomp(s)) {a[$0][s]++}}}} END {for (i in a) {for (j in a[i]) {is_substring=0; for (k in a[i]) {if (j!=k && index(k, j)>0) {is_substring=1; break}}; if (!is_substring) {b[i][j]=a[i][j]}}}; for (i in b) {for (j in b[i]) {print i, j, b[i][j]}}}' file.fa
- Find all alphanumeric palindromes e.g., TATA.
awk 'function reverse(s) {o=""; for (k=length(s); k>0; k--) {o=o substr(s, k, 1)}; return o} BEGIN {PROCINFO["sorted_in"]="@ind_str_desc"; OFS="\t"} /^>/ {getline seq; sub(/^>/, "", $0); for (i=4; i<=length(seq); i+=2) {for (j=1; j<=length(seq)+1-i; j++) {s=substr(seq, j, i); if (s==reverse(s)) {a[$0][s]++}}}} END {for (i in a) {for (j in a[i]) {is_substring=0; for (k in a[i]) {if (j!=k && index(k, j)>0) {is_substring=1; break}}; if (!is_substring) {b[i][j]=a[i][j]}}}; for (i in b) {for (j in b[i]) {print i, j, b[i][j]}}}' file.fa
- Find hairpin loops. Here the hairpin k-mer length is 14 (k=14): two 4-base stems flanking a 6-base loop (g=6).
awk -v k=14 -v g=6 'function revcomp(s) {c["A"]="T"; c["C"]="G"; c["G"]="C"; c["T"]="A"; o=""; for (t=length(s); t>0; t--) {o=o c[substr(s, t, 1)]}; return o} BEGIN {PROCINFO["sorted_in"]="@val_num_desc"} /^>/ {getline seq; sub(/^>/, "", $0); for (i=1; i<=length(seq)+1-k; i++) {s=substr(seq, i, k); a[s]++; sum++}; for (j in a) {head=substr(j, 1, length(j)/2-g/2); tail=substr(j, length(j)/2+1+g/2, length(j)); if (head==revcomp(tail)) {printf "%s\t%s\t%.3f\n", $0, j, a[j]/sum}}}' file.fa
- Find quasipalindrome.
awk -v k=6 -v cutoff=1 'function quasipalindrome(s) {c["A"]="T"; c["T"]="A"; c["G"]="C"; c["C"]="G"; split(s, t, ""); score=0; for (m=1; m<=length(t); m++) {if (t[m]!=c[t[length(t)+1-m]]) {score++}}; return (score>cutoff*2 ? 0 : 1)} BEGIN {PROCINFO["sorted_in"]="@val_num_desc"} /^>/ {getline seq; sub(/^>/, "", $0); for (i=1; i<=length(seq)+1-k; i++) {s=substr(seq, i, k); a[s]++; sum++}; for (j in a) {if (quasipalindrome(j)) {printf "%s\t%s\t%.3f\n", $0, j, a[j]/sum}}}' file.fa
- Print in reversed order (like tac).
awk '{a[i++]=$0} END {while (i--) print a[i]}' file
- Find the longest substring without repeated characters.
awk '{longest=""; for (i=1; i<=length($0); i++) {current=""; for (j=i; j<=length($0); j++) {if (index(current, substr($0, j, 1))==0) {current=current substr($0, j, 1); if (length(current)>length(longest)) {longest=current}} else {break}}}; print longest}'
awk '{longest=""; for (i=1; i<=length($0); i++) {current=""; for (j=i; j<=length($0); j++) {if (index(current, substr($0, j, 1))==0) {current=current substr($0, j, 1); if (length(current)>=length(longest)) {longest=(length(current)>length(longest) ? current : longest "\n" current)}} else {break}}}; print longest}'
- Clone awk array (from Stephen Kitt).
awk 'function clone(original, copy) {for (i in original) {if (isarray(original[i])) {copy[i][1]=""; delete copy[i][1]; clone(original[i], copy[i])} else {copy[i]=original[i]}}}'
- Generate random sequences (set seed, the number of sequences n, and the length of sequences l).
Fasta
Nucleotides:
awk -v seed=3 -v n=100 -v l=1000 'BEGIN {srand(seed); split("ATGC", a, ""); for (s=1; s<=n; s++) {out=""; len=int(rand()*l+1); while (length(out)<len || length(out)<0.5*l) {out=out a[int(rand()*length(a)+1)]}; printf ">%."length(n)"d\n%s\n", s, out}}'
Amino acids:
awk -v seed=3 -v n=100 -v l=1000 'BEGIN {srand(seed); split("ACDEFGHIKLNPQRSTVWY", a, ""); for (s=1; s<=n; s++) {out="M"; len=int(rand()*l+1); while (length(out)<len || length(out)<0.5*l) {out=out a[int(rand()*length(a)+1)]}; printf ">%."length(n)"d\n%s\n", s, out}}'
Fastq:
awk -v seed=3 -v phred=33 -v threshold=20 -v n=100 -v l=1000 -l ordchr 'BEGIN {score=threshold+phred; srand(seed); split("ATGC", a, ""); for (s=1; s<=n; s++) {out=""; qual=""; len=int(rand()*l+1); while (length(out)<len || length(out)<0.5*l) {out=out a[int(rand()*length(a)+1)]; qual=qual chr(int(rand()*score+score))}; printf "@%." length(n) "d\n%s\n+\n%s\n", s, out, qual}}'
- Get the homologous sequences from BLAST result.
awk 'function abs(n) {return (n>0 ? n : -n)} BEGIN {FS="\t"; OFS="\n"} ($3>=95 && abs($(NF-3)-$(NF-2)+1)*100/$5>90) {a[$1][$2]; next} /^>/ {getline seq; for (i in a) {for (j in a[i]) {if ($0==">"j) {print $0, seq > i"_homologs.fasta"}}}}' blast.out database.fasta
awk 'fname!=FILENAME {fname=FILENAME; idx++} idx==1 {FS="\t"; if ($3>=90 && $4/$5>=0.9) {a[">"$1][">"$2]++}} idx==2 {if ($0~/^>/) {getline seq; b[$0]=seq}} idx==3 {if ($0~/^>/) {getline seq; c[$0]=seq}} END {for (i in b) {if (i in a) {print i"\n"b[i]; for (j in c) {if (a[i][j]) print j"\n"c[j]}}}}' blast.out query.fasta subject.fasta
- Generate Google Drive download link for wget or curl
echo "<Google Drive shared link>" | sed 's|/file/d/|/uc?id=|' | sed 's|/view.*||'
- Filter sequences using their k-mer profiles.
awk -v k=11 'NR%4==1 && /^@/ {getline seq; for (i=1; i<=length(seq)+1-k; i++) {s=substr(seq, i, k); a[s]++}; getline id; getline qual; b[seq]=$0 "\n" seq "\n" id "\n" qual} END {for (i in a) {if (a[i]>1) {for (j in b) {if (index(j, i)>0) {print b[j]; delete b[j]}}}}}'
- Sort sequences based on length.
Fasta:
awk 'BEGIN {PROCINFO["sorted_in"]="@ind_num_desc"} /^>/ {getline seq; a[length(seq)][$0 "\n" seq]} END {for (i in a) {for (j in a[i]) print j}}'
Fastq:
awk 'BEGIN {PROCINFO["sorted_in"]="@ind_num_desc"} NR%4==1 && /^@/ {getline seq; getline id; getline qual; a[length(seq)][$0 "\n" seq "\n" id "\n" qual]} END {for (i in a) {for (j in a[i]) print j}}'
- Pebblescout for quick sequence scan.
awk '/^>/ {gsub(/ /, "_", $0); printf $0 " "; getline; print}' <(zcat file.fastq.gz | awk 'NR%4==1 {sub(/^@/, ">", $0); print; getline; print}') | xargs -P 4 -n 2 bash -c 'id="$0"; seq="$1"; nrow=1; boundary=$(awk '\''BEGIN {for(i=0;i<8;i++) printf "%c", int(rand()*26) + 97}'\''); cmd="curl -s '\''https://pebblescout.ncbi.nlm.nih.gov/sra-cl-be/sra-cl-be.cgi?rettype=pebblescout'\'' -H '\''content-type: multipart/form-data; boundary=${boundary}'\'' --data-raw $'\''--${boundary}\r\nContent-Disposition: form-data; name=\"m\"\r\n\r\n2\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"g\"\r\n\r\n11924\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"c\"\r\n\r\n1030\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"_r\"\r\n\r\n${nrow}\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"accession\"\r\n\r\n\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"_h\"\r\n\r\n1001\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"fasta\"\r\n\r\n${id}\r\n${seq}\r\n\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"from\"\r\n\r\n\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"to\"\r\n\r\n\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"db\"\r\n\r\nrefseq\r\n--${boundary}\r\nContent-Disposition: form-data; name=\"retmode\"\r\n\r\n\r\n--${boundary}--\r\n'\'' --compressed"; eval $cmd | awk -v n=$nrow '\''/QueryID/ {for (i=0; i<n; i++) {getline; if ($0!~/^#.*$/ && $0!~/^$/) {print}}}'\'''
- Detect CpG islands using Gardiner-Garden and Frommer (1987) method.
awk -v window=200 -v threshold=0.6 'function cpgisland(s, w, t) {found=0; numC=0; numG=0; numCG=0; s=tolower(s); if (w>length(s)) {print "Input sequence must be longer than " w " bases!"}; for (i=1; i<=length(s); i++) {base=substr(s, i, 1); if (base=="g") {numG++}; if (base=="c") {numC++; next_base=substr(s, i+1, 1); if (next_base=="g") {numCG++; numG++; i++}}; if (i>w) {Y=(numC && numG ? numCG/((numC*numG)/w) : 0); GCcontent=(numC+numG)*100/w; if (Y>=t && GCcontent>50) {start=i-w+1; end=i; found=1; print start ".." end "\t" Y "\t" GCcontent}; tolose=substr(s, i-w+1, 1); if (tolose=="c") {numC--; next_tolose=substr(s, i-w+2, 1); if (next_tolose=="g") {numCG--}}; if (tolose=="g") {numG--}}}; if (found==0) {print "No CpG island identified"}} /^>/ {getline seq; print $0; print cpgisland(seq, window, threshold)}' file.fa
- Find the longest stretch of repeated characters.
awk 'function findrepeat(s) {
max = 0
current = 1
delete a
for (i = 2; i <= length(s); i++) {
if (substr(s, i, 1) == substr(s, i-1, 1)) {
current++
} else {
if (current > max) {
max = current
a[substr(s, i-1, 1)][i-current] = max
}
current = 1
}
}
# Check for the maximum streak at the end of the sequence
if (current > max) {
max = current
a[substr(s, length(s), 1)][length(s)+1-current] = max
}
# Return the character with the longest streak
for (char in a) {
for (loc in a[char]) {
if (a[char][loc] == max) {
return char "\t" loc "\t" max
}
}
}
}
/^>/ {
getline seq
sub(/^>/, "", $0)
print $0 "\t" findrepeat(seq)
}' file.fa
- Highlight differences between sequences of same length.
awk -v width=60 -v start=0 -v end=0 '
BEGIN {
# Define ANSI color codes
RED = "\033[31m"
RESET = "\033[0m"
}
/^>/ {
getline seq
sequences[++count] = seq
len = length(seq)
}
END {
if (start == 0) {
start = 1
}
if (end == 0 || end > len) {
end = len
}
for (i in sequences) {
if (length(sequences[i]) != len) {
exit 1
}
}
for (pos = start; pos <= end; pos += width) {
chunk_end = (pos + width - 1 <= end) ? (pos + width - 1) : end
# Collect and compare characters for the current chunk
for (i = pos; i <= chunk_end; i++) {
for (j = 1; j <= count; j++) {
char_chunk[j][i] = substr(sequences[j], i, 1)
}
}
for (i = pos; i <= chunk_end; i++) {
for (j = 1; j <= count; j++) {
for (k = j + 1; k <= count; k++) {
if (char_chunk[j][i] != char_chunk[k][i]) {
char_chunk[j][i] = RED char_chunk[j][i] RESET
char_chunk[k][i] = RED char_chunk[k][i] RESET
}
}
}
}
# Print the chunk for each sequence
for (j = 1; j <= count; j++) {
for (i = pos; i <= chunk_end; i++) {
printf "%s", char_chunk[j][i]
}
printf "\n"
}
printf "\n"
}
}
' file.fa
- Remove repetitive subsequences (of length 21 or longer) and keep only one occurrence.
awk '/^>/ {
getline seq
input = tolower(seq) # Convert input to lowercase for case insensitivity
str_len = length(input) # Length of the input string
found = 0
for (len = 21; len <= int(str_len / 2); len++) {
for (i = 1; i <= str_len - 2 * len + 1; i++) {
substr1 = substr(input, i, len) # Extract first substring
substr2 = substr(input, i + len, len) # Extract second substring
if (substr1 == substr2 && !(substr1 in seen)) {
seen[substr1] = 1
i += len - 1
found = 1
}
}
}
if (found == 0) {
print $0 "\n" seq
} else {
for (i in seen) {
gsub("(" i ")+", i, input)
}
print $0 "\n" input
}
}' file.fa
- Split a line according to the desired length
awk -v n=40 '{ start = 1; while (start <= length($0)) { end = start + n - 1; if (end >= length($0)) { chunk = substr($0, start); print chunk; break } chunk = substr($0, start, n); if (substr($0, end + 1, 1) ~ /[a-zA-Z]/) { space_pos = match(chunk, / [^ ]*$/); if (space_pos > 0) { print substr($0, start, space_pos); start = start + space_pos } else { printf "%s-\n", chunk; start = end + 1 } } else { print chunk; start = end + 1 } } }' file.txt
- Split a file according to certain characters
awk -v delimiter="^//" -v prefix="output_file_prefix_" -v extension=".ext" '{
if ($0 ~ delimiter) {
if (line) {
print line > (prefix count++ extension)
}
line = ""
next
}
line = line (line == "" ? "" : "\n") $0
}
END {
if (line) {
print line > (prefix count extension)
}
}' file.txt
- Skip a section based on the patterns
awk -v patternA="START" -v patternB="END" '$0 ~ patternA {skip=1; next} $0 ~ patternB {skip=0} !skip' file.txt
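A toy example to illustrate (the line matching patternB is kept; add next to its action to drop it as well):
Input:
keep1
START
skipped
END
keep2
Output:
keep1
END
keep2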