Splitting fasta file into smaller files based on header pattern
$begingroup$
I have to split this FASTA file into smaller files, writing each group of records to its own file. The headers in my file look like this:
>lcl|CP000522.1_prot_ABO13860.1_1 [locus_tag=A1S_3471] [protein=hypothetical protein] [protein_id=ABO13860.1] [location=1..957] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13850.1_2 [locus_tag=A1S_3461] [protein=DNA replication protein] [protein_id=ABO13850.1] [location=950..1504] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13851.1_3 [locus_tag=A1S_3462] [protein=hypothetical protein] [protein_id=ABO13851.1] [location=complement(2523..3437)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13852.1_4 [locus_tag=A1S_3463] [protein=YPPCP.09C-like protein] [protein_id=ABO13852.1] [location=3538..4788] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13853.1_5 [locus_tag=A1S_3464] [protein=Cro-like protein] [protein_id=ABO13853.1] [location=5039..5629] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13854.1_6 [locus_tag=A1S_3465] [protein=hypothetical protein] [protein_id=ABO13854.1] [location=complement(6340..6906)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13855.1_7 [locus_tag=A1S_3466] [protein=Resolvase] [protein_id=ABO13855.1] [location=complement(7074..7685)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13856.1_8 [locus_tag=A1S_3467] [protein=hypothetical protein] [protein_id=ABO13856.1] [location=complement(8602..9732)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13857.1_9 [locus_tag=A1S_3468] [protein=putative lipoprotein] [protein_id=ABO13857.1] [location=complement(10072..10374)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13858.1_10 [locus_tag=A1S_3469] [protein=Diaminopimelate decarboxylase] [protein_id=ABO13858.1] [location=complement(10367..10723)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13859.1_11 [locus_tag=A1S_3470] [protein=regulatory protein LysR] [protein_id=ABO13859.1] [location=complement(12076..12444)] [gbkey=CDS]
The other pattern is
>lcl|CP000523.1_prot_ABO13861.1_1 [locus_tag=A1S_3472] [protein=DNA replication protein] [protein_id=ABO13861.1] [location=1..951] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13862.1_2 [locus_tag=A1S_3473] [protein=hypothetical protein] [protein_id=ABO13862.1] [location=3048..4262] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13863.1_3 [locus_tag=A1S_3474] [protein=hypothetical protein] [protein_id=ABO13863.1] [location=4357..5133] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13864.1_4 [locus_tag=A1S_3475] [protein=hypothetical protein] [protein_id=ABO13864.1] [location=6197..8608] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13865.1_5 [locus_tag=A1S_3476] [protein=secretory lipase] [protein_id=ABO13865.1] [location=8705..9403] [gbkey=CDS]
So my question is: how do I parse the file and write the records into individual files, such that the CP000522 records are written to one file, the CP000523 records to another file, and so on?
So far, what I understand is that I have to match the pattern after `>lcl|`. There are also other patterns in the file, such as `LN997847`.
I am not sure how to proceed; I tried it in R but failed. It can be done with awk and sed, which I tried, but I can't define something that parses all the headers, i.e. something that takes into account `CP` as well as `LN`.
Any suggestion or help would be highly appreciated.
here is my file
fasta shell
$endgroup$
add a comment |
$begingroup$
I have to split this fasta files into smaller files and write them into individual files my files
>lcl|CP000522.1_prot_ABO13860.1_1 [locus_tag=A1S_3471] [protein=hypothetical protein] [protein_id=ABO13860.1] [location=1..957] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13850.1_2 [locus_tag=A1S_3461] [protein=DNA replication protein] [protein_id=ABO13850.1] [location=950..1504] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13851.1_3 [locus_tag=A1S_3462] [protein=hypothetical protein] [protein_id=ABO13851.1] [location=complement(2523..3437)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13852.1_4 [locus_tag=A1S_3463] [protein=YPPCP.09C-like protein] [protein_id=ABO13852.1] [location=3538..4788] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13853.1_5 [locus_tag=A1S_3464] [protein=Cro-like protein] [protein_id=ABO13853.1] [location=5039..5629] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13854.1_6 [locus_tag=A1S_3465] [protein=hypothetical protein] [protein_id=ABO13854.1] [location=complement(6340..6906)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13855.1_7 [locus_tag=A1S_3466] [protein=Resolvase] [protein_id=ABO13855.1] [location=complement(7074..7685)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13856.1_8 [locus_tag=A1S_3467] [protein=hypothetical protein] [protein_id=ABO13856.1] [location=complement(8602..9732)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13857.1_9 [locus_tag=A1S_3468] [protein=putative lipoprotein] [protein_id=ABO13857.1] [location=complement(10072..10374)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13858.1_10 [locus_tag=A1S_3469] [protein=Diaminopimelate decarboxylase] [protein_id=ABO13858.1] [location=complement(10367..10723)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13859.1_11 [locus_tag=A1S_3470] [protein=regulatory protein LysR] [protein_id=ABO13859.1] [location=complement(12076..12444)] [gbkey=CDS]
The other pattern is
>lcl|CP000523.1_prot_ABO13861.1_1 [locus_tag=A1S_3472] [protein=DNA replication protein] [protein_id=ABO13861.1] [location=1..951] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13862.1_2 [locus_tag=A1S_3473] [protein=hypothetical protein] [protein_id=ABO13862.1] [location=3048..4262] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13863.1_3 [locus_tag=A1S_3474] [protein=hypothetical protein] [protein_id=ABO13863.1] [location=4357..5133] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13864.1_4 [locus_tag=A1S_3475] [protein=hypothetical protein] [protein_id=ABO13864.1] [location=6197..8608] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13865.1_5 [locus_tag=A1S_3476] [protein=secretory lipase] [protein_id=ABO13865.1] [location=8705..9403] [gbkey=CDS]
So now my idea is how do i parse and write them into individual files such that CP000522 output written to one file and CP000523 written to another file so forth and so on.
So far what i understand is i have to match the pattern after >lcl
so there are other patterns like "LN997847
" in the file
Not sure how to proceed tried it in R but failed
it can be done with awk and sed which i tried but i can;t define something that parse all header like takes into account CP
as well as LN
.
Any suggestion or help would be highly appreciated .
here is my file
fasta shell
$endgroup$
$begingroup$
This is very easy with a bit of scripting; one script will do it all. We do have R experts here (best solution)... Otherwise I'll post the code. How do you want each file named (this is important no matter which approach you take)?
$endgroup$
– Michael G.
2 days ago
add a comment |
$begingroup$
I have to split this fasta files into smaller files and write them into individual files my files
>lcl|CP000522.1_prot_ABO13860.1_1 [locus_tag=A1S_3471] [protein=hypothetical protein] [protein_id=ABO13860.1] [location=1..957] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13850.1_2 [locus_tag=A1S_3461] [protein=DNA replication protein] [protein_id=ABO13850.1] [location=950..1504] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13851.1_3 [locus_tag=A1S_3462] [protein=hypothetical protein] [protein_id=ABO13851.1] [location=complement(2523..3437)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13852.1_4 [locus_tag=A1S_3463] [protein=YPPCP.09C-like protein] [protein_id=ABO13852.1] [location=3538..4788] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13853.1_5 [locus_tag=A1S_3464] [protein=Cro-like protein] [protein_id=ABO13853.1] [location=5039..5629] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13854.1_6 [locus_tag=A1S_3465] [protein=hypothetical protein] [protein_id=ABO13854.1] [location=complement(6340..6906)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13855.1_7 [locus_tag=A1S_3466] [protein=Resolvase] [protein_id=ABO13855.1] [location=complement(7074..7685)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13856.1_8 [locus_tag=A1S_3467] [protein=hypothetical protein] [protein_id=ABO13856.1] [location=complement(8602..9732)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13857.1_9 [locus_tag=A1S_3468] [protein=putative lipoprotein] [protein_id=ABO13857.1] [location=complement(10072..10374)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13858.1_10 [locus_tag=A1S_3469] [protein=Diaminopimelate decarboxylase] [protein_id=ABO13858.1] [location=complement(10367..10723)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13859.1_11 [locus_tag=A1S_3470] [protein=regulatory protein LysR] [protein_id=ABO13859.1] [location=complement(12076..12444)] [gbkey=CDS]
The other pattern is
>lcl|CP000523.1_prot_ABO13861.1_1 [locus_tag=A1S_3472] [protein=DNA replication protein] [protein_id=ABO13861.1] [location=1..951] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13862.1_2 [locus_tag=A1S_3473] [protein=hypothetical protein] [protein_id=ABO13862.1] [location=3048..4262] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13863.1_3 [locus_tag=A1S_3474] [protein=hypothetical protein] [protein_id=ABO13863.1] [location=4357..5133] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13864.1_4 [locus_tag=A1S_3475] [protein=hypothetical protein] [protein_id=ABO13864.1] [location=6197..8608] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13865.1_5 [locus_tag=A1S_3476] [protein=secretory lipase] [protein_id=ABO13865.1] [location=8705..9403] [gbkey=CDS]
So now my idea is how do i parse and write them into individual files such that CP000522 output written to one file and CP000523 written to another file so forth and so on.
So far what i understand is i have to match the pattern after >lcl
so there are other patterns like "LN997847
" in the file
Not sure how to proceed tried it in R but failed
it can be done with awk and sed which i tried but i can;t define something that parse all header like takes into account CP
as well as LN
.
Any suggestion or help would be highly appreciated .
here is my file
fasta shell
$endgroup$
I have to split this fasta files into smaller files and write them into individual files my files
>lcl|CP000522.1_prot_ABO13860.1_1 [locus_tag=A1S_3471] [protein=hypothetical protein] [protein_id=ABO13860.1] [location=1..957] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13850.1_2 [locus_tag=A1S_3461] [protein=DNA replication protein] [protein_id=ABO13850.1] [location=950..1504] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13851.1_3 [locus_tag=A1S_3462] [protein=hypothetical protein] [protein_id=ABO13851.1] [location=complement(2523..3437)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13852.1_4 [locus_tag=A1S_3463] [protein=YPPCP.09C-like protein] [protein_id=ABO13852.1] [location=3538..4788] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13853.1_5 [locus_tag=A1S_3464] [protein=Cro-like protein] [protein_id=ABO13853.1] [location=5039..5629] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13854.1_6 [locus_tag=A1S_3465] [protein=hypothetical protein] [protein_id=ABO13854.1] [location=complement(6340..6906)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13855.1_7 [locus_tag=A1S_3466] [protein=Resolvase] [protein_id=ABO13855.1] [location=complement(7074..7685)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13856.1_8 [locus_tag=A1S_3467] [protein=hypothetical protein] [protein_id=ABO13856.1] [location=complement(8602..9732)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13857.1_9 [locus_tag=A1S_3468] [protein=putative lipoprotein] [protein_id=ABO13857.1] [location=complement(10072..10374)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13858.1_10 [locus_tag=A1S_3469] [protein=Diaminopimelate decarboxylase] [protein_id=ABO13858.1] [location=complement(10367..10723)] [gbkey=CDS]
>lcl|CP000522.1_prot_ABO13859.1_11 [locus_tag=A1S_3470] [protein=regulatory protein LysR] [protein_id=ABO13859.1] [location=complement(12076..12444)] [gbkey=CDS]
The other pattern is
>lcl|CP000523.1_prot_ABO13861.1_1 [locus_tag=A1S_3472] [protein=DNA replication protein] [protein_id=ABO13861.1] [location=1..951] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13862.1_2 [locus_tag=A1S_3473] [protein=hypothetical protein] [protein_id=ABO13862.1] [location=3048..4262] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13863.1_3 [locus_tag=A1S_3474] [protein=hypothetical protein] [protein_id=ABO13863.1] [location=4357..5133] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13864.1_4 [locus_tag=A1S_3475] [protein=hypothetical protein] [protein_id=ABO13864.1] [location=6197..8608] [gbkey=CDS]
>lcl|CP000523.1_prot_ABO13865.1_5 [locus_tag=A1S_3476] [protein=secretory lipase] [protein_id=ABO13865.1] [location=8705..9403] [gbkey=CDS]
So now my idea is how do i parse and write them into individual files such that CP000522 output written to one file and CP000523 written to another file so forth and so on.
So far what i understand is i have to match the pattern after >lcl
so there are other patterns like "LN997847
" in the file
Not sure how to proceed tried it in R but failed
it can be done with awk and sed which i tried but i can;t define something that parse all header like takes into account CP
as well as LN
.
Any suggestion or help would be highly appreciated .
here is my file
fasta shell
fasta shell
asked 2 days ago
krushnach Chandrakrushnach Chandra
43119
43119
$begingroup$
This is very easy with a bit of scripting one script will do it all. We do have R experts here (best solution)....Otherwise I'll post the code. How do you want each file named (important no matter which approach you take)?
$endgroup$
– Michael G.
2 days ago
add a comment |
$begingroup$
This is very easy with a bit of scripting one script will do it all. We do have R experts here (best solution)....Otherwise I'll post the code. How do you want each file named (important no matter which approach you take)?
$endgroup$
– Michael G.
2 days ago
$begingroup$
This is very easy with a bit of scripting one script will do it all. We do have R experts here (best solution)....Otherwise I'll post the code. How do you want each file named (important no matter which approach you take)?
$endgroup$
– Michael G.
2 days ago
$begingroup$
This is very easy with a bit of scripting one script will do it all. We do have R experts here (best solution)....Otherwise I'll post the code. How do you want each file named (important no matter which approach you take)?
$endgroup$
– Michael G.
2 days ago
add a comment |
1 Answer
1
active
oldest
votes
$begingroup$
Here's a simple awk approach:
awk '{if(/^>/){split($1,a,"[|.]")}print >> a[2]".fa"}' Protein_FASTA.txt
Or, more concisely, just:
awk '/^>/{split($1,a,"[|.]")}{print >> a[2]".fa"}' Protein_FASTA.txt
When run on the file linked to in your question, that results in the following files:
$ ls
AP014650.fa CP003848.fa CP007713.fa CP012005.fa CP015122.fa CP017645.fa CP018422.fa CP020594.fa CP023021.fa CP024577.fa CP026712.fa CP027245.fa CP030108.fa
CP000522.fa CP003850.fa CP007714.fa CP012007.fa CP015365.fa CP017647.fa CP018678.fa CP020596.fa CP023023.fa CP024578.fa CP026748.fa CP027529.fa CP030109.fa
CP000523.fa CP003887.fa CP008707.fa CP012008.fa CP015366.fa CP017649.fa CP018679.fa CP021322.fa CP023024.fa CP025267.fa CP026749.fa CP027531.fa CU459137.fa
CP000864.fa CP003888.fa CP008708.fa CP012953.fa CP015484.fa CP017651.fa CP019218.fa CP021327.fa CP023025.fa CP026126.fa CP027121.fa CP027532.fa CU459138.fa
CP000865.fa CP003907.fa CP008709.fa CP012954.fa CP015485.fa CP017653.fa CP020573.fa CP021348.fa CP023027.fa CP026127.fa CP027122.fa CP027608.fa CU459139.fa
CP001183.fa CP003908.fa CP008850.fa CP012955.fa CP015486.fa CP017655.fa CP020575.fa CP021783.fa CP023028.fa CP026128.fa CP027124.fa CP027609.fa CU459140.fa
CP001922.fa CP003968.fa CP008851.fa CP012956.fa CP016296.fa CP017657.fa CP020576.fa CP021784.fa CP023030.fa CP026129.fa CP027179.fa CP027610.fa JN377410.fa
CP001923.fa CP004359.fa CP010398.fa CP013925.fa CP016297.fa CP018144.fa CP020577.fa CP021785.fa CP023032.fa CP026339.fa CP027180.fa CP029570.fa LN865144.fa
CP001938.fa CP006769.fa CP010399.fa CP014216.fa CP016299.fa CP018255.fa CP020580.fa CP021786.fa CP023033.fa CP026340.fa CP027181.fa CP029571.fa LN997847.fa
CP002523.fa CP007578.fa CP010400.fa CP014217.fa CP016301.fa CP018257.fa CP020585.fa CP021787.fa CP023035.fa CP026705.fa CP027182.fa CP029572.fa LT594096.fa
CP002524.fa CP007579.fa CP010780.fa CP014292.fa CP016302.fa CP018333.fa CP020589.fa CP022284.fa CP024125.fa CP026706.fa CP027243.fa CP029573.fa Protein_FASTA.txt
CP003501.fa CP007580.fa CP010782.fa CP014293.fa CP017643.fa CP018334.fa CP020593.fa CP022285.fa CP024419.fa CP026708.fa CP027244.fa CP030107.fa
Explanation
`if(/^>/){split($1,a,"[|.]")}`: if this line starts with a `>`, split the first field on any occurrence of either `|` or `.` and save the results in the array `a`. Since your header lines all start with `>lcl|`, followed by the string you are looking for and then a `.`, this means that the second value in the `a` array will be your target string.
`print >> a[2]".fa"`: print (append, `>>`) the current line to a file called "whatever the name of this sequence is" (`a[2]`) plus `.fa`. This is run for every line in your input file. Note that if you run the same command again, you will need to first delete the files created the first time. If you don't, because I am using `>>`, you will just append to the existing files.
$endgroup$
1
$begingroup$
Personally, I would have used a "_" along with "." and "|", but who am I to say. The small advantage of a script is that if it's rerun it will clobber the old run's output, without fear of appending.
$endgroup$
– Michael G.
2 days ago
2
$begingroup$
@MichaelG. you can do it easily enough in perl, it's just slightly more cumbersome since you need to explicitly open and close each file: `perl -ne 'if(/>.+?\|(.*?)\./){$name=$1; }open(my $fh, ">>","$name.fa"); print $fh "$_"; close($fh)' Protein_FASTA.txt`. I didn't include the `_` because the OP seemed to want the text before the version (the `.N`) to be the file name. I'd have used the `_` myself too.
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
wow so many solutions...let me run this..
$endgroup$
– krushnach Chandra
2 days ago
1
$begingroup$
You're welcome :). @MichaelG. note that I had a mistake in the first version of that perl one-liner. You need `>>`, not `>` (I've edited the comment), and it still has the same issue with existing files. The only way around that is to store everything in memory and only write at the end: `perl -ne 'if(/>.+?\|(.*?)\./){$name=$1;}push @{$k{$name}},$_; END{for $name (keys(%k)){open(my $fh, ">","$name.fa"); print $fh @{$k{$name}}; close($fh)}}' Protein_FASTA.txt`
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
I suggest replacing the `if (…){…}` block with the more awk-ish `/^>/ {split($1, a, "[|.]")}`, which simplifies the actual print block to just `{print >> a[2] ".fa"}`.
$endgroup$
– Konrad Rudolph
2 days ago
|
show 7 more comments
Your Answer
// Adds a creation callback to the Markdown editor that forwards each new
// editor (and its postfix) to the MathJax editing helper.
// NOTE(review): the two pairs passed along look like inline-math delimiters
// ($…$ and \(…\)) — confirm against the mathjaxEditing module.
StackExchange.ifUsing("editor", () =>
    StackExchange.using("mathjaxEditing", () => {
        const onEditorCreated = (editor, postfix) => {
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["$", "$"], ["\\(","\\)"]]);
        };
        StackExchange.MarkdownEditor.creationCallbacks.add(onEditorCreated);
    }),
"mathjax-editing");
// Ready callback for the answer form: initialises the tag renderer, then
// creates the answer editor (after the "snippets" module when snippets are
// enabled in the site settings).
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "676"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    /**
     * Calls StackExchange.prepareEditor with the option set for the
     * answer editor. Declared as a function declaration so that it is
     * hoisted and can be referenced by the callbacks registered above.
     */
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: false,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                // The HTML snippets use \u003c / \u003e for "<" / ">" and
                // escaped double quotes; the escapes are required for these
                // to be valid string literals.
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer",
            immediatelyShowMarkdownHelp: true
        });
    }
});
Sign up or log in
// Ready callback: passes the '#login-link' selector to the draft-save click helper.
StackExchange.ready(() => {
    const loginLinkSelector = '#login-link';
    StackExchange.helpers.onClickDraftSave(loginLinkSelector);
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
// Ready callback: initialises the post-login OpenID flow for the
// '.new-post-login' element(s), passing the URL-encoded address of this
// question's "new answer" anchor and the 'question_page' context string.
StackExchange.ready(() => {
    const encodedReturnUrl = 'https%3a%2f%2fbioinformatics.stackexchange.com%2fquestions%2f7273%2fsplitting-fasta-file-into-smaller-files-based-on-header-pattern%23new-answer';
    StackExchange.openid.initPostLogin('.new-post-login', encodedReturnUrl, 'question_page');
});
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
$begingroup$
Here's a simple awk approach:
awk '{if(/^>/){split($1,a,"[|.]")}print >> a[2]".fa"}' Protein_FASTA.txt
Or, more concisely, just:
awk '/^>/{split($1,a,"[|.]")}{print >> a[2]".fa"}' Protein_FASTA.txt
When run on the file linked to in your question, that results in the following files:
$ ls
AP014650.fa CP003848.fa CP007713.fa CP012005.fa CP015122.fa CP017645.fa CP018422.fa CP020594.fa CP023021.fa CP024577.fa CP026712.fa CP027245.fa CP030108.fa
CP000522.fa CP003850.fa CP007714.fa CP012007.fa CP015365.fa CP017647.fa CP018678.fa CP020596.fa CP023023.fa CP024578.fa CP026748.fa CP027529.fa CP030109.fa
CP000523.fa CP003887.fa CP008707.fa CP012008.fa CP015366.fa CP017649.fa CP018679.fa CP021322.fa CP023024.fa CP025267.fa CP026749.fa CP027531.fa CU459137.fa
CP000864.fa CP003888.fa CP008708.fa CP012953.fa CP015484.fa CP017651.fa CP019218.fa CP021327.fa CP023025.fa CP026126.fa CP027121.fa CP027532.fa CU459138.fa
CP000865.fa CP003907.fa CP008709.fa CP012954.fa CP015485.fa CP017653.fa CP020573.fa CP021348.fa CP023027.fa CP026127.fa CP027122.fa CP027608.fa CU459139.fa
CP001183.fa CP003908.fa CP008850.fa CP012955.fa CP015486.fa CP017655.fa CP020575.fa CP021783.fa CP023028.fa CP026128.fa CP027124.fa CP027609.fa CU459140.fa
CP001922.fa CP003968.fa CP008851.fa CP012956.fa CP016296.fa CP017657.fa CP020576.fa CP021784.fa CP023030.fa CP026129.fa CP027179.fa CP027610.fa JN377410.fa
CP001923.fa CP004359.fa CP010398.fa CP013925.fa CP016297.fa CP018144.fa CP020577.fa CP021785.fa CP023032.fa CP026339.fa CP027180.fa CP029570.fa LN865144.fa
CP001938.fa CP006769.fa CP010399.fa CP014216.fa CP016299.fa CP018255.fa CP020580.fa CP021786.fa CP023033.fa CP026340.fa CP027181.fa CP029571.fa LN997847.fa
CP002523.fa CP007578.fa CP010400.fa CP014217.fa CP016301.fa CP018257.fa CP020585.fa CP021787.fa CP023035.fa CP026705.fa CP027182.fa CP029572.fa LT594096.fa
CP002524.fa CP007579.fa CP010780.fa CP014292.fa CP016302.fa CP018333.fa CP020589.fa CP022284.fa CP024125.fa CP026706.fa CP027243.fa CP029573.fa Protein_FASTA.txt
CP003501.fa CP007580.fa CP010782.fa CP014293.fa CP017643.fa CP018334.fa CP020593.fa CP022285.fa CP024419.fa CP026708.fa CP027244.fa CP030107.fa
Explanation
if(/^>/){split($1,a,"[|.]")
: if this line starts with a>
, split the first field on any occurrence of either|
or.
and save the results in the arraya
. Since your header lines all start with>lcl|
, then the string you are looking for and a.
, this means that the second value in thea
array will be your target string.
print >> a[2]".fa"
: print (append,>>
) the current line to a file called "whatever the name of this sequence is" (a[2]
) and.fa
. This is run for every line in your input file. Note that if you run the same command again, you will need to first delete the files created the first time. If you don't, because I am using the>>
, you will just append to the existing files.
$endgroup$
1
$begingroup$
Personally, I would have used a "_" along with "." and "|", but who am I. The small advantage of a script is that if its rerun it will clobber the old run output, without fear of appending.
$endgroup$
– Michael G.
2 days ago
2
$begingroup$
@MichaelG. you can do it easily enough in perl, it's just slightly more cumbersome since you need to explicitly open and close each file:perl -ne 'if(/>.+?|(.*?)./){$name=$1; }open(my $fh, ">>","$name.fa"); print $fh "$_"; close($fh)' Protein_FASTA.txt
. I didn't include the_
because the OP seemed to want the text before the version (the.N
) to be the file name. I'd have used the_
myself too.
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
wow so many solutions...let me run this..
$endgroup$
– krushnach Chandra
2 days ago
1
$begingroup$
You're welcome :). @MichaelG. note that I had a mistake in the first version of that perl one-liner. You need>>
, not>
(I've edited the comment) and it still has the same issue with existing files. The only way around that is to store everything in memory and only write at the end:perl -ne 'if(/>.+?|(.*?)./){$name=$1;}push @{$k{$name}},$_; END{for $name (keys(%k)){open(my $fh, ">","$name.fa"); print $fh @{$k{$name}}; close($fh)}}' Protein_FASTA.txt
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
I suggest replacing theif (…){…}
block with the more awk-ish/^>/ {split($1, a, "[|.]")}
, which simplified the actual print look to just{print >> a[2] ".fa"}
.
$endgroup$
– Konrad Rudolph
2 days ago
|
show 7 more comments
$begingroup$
Here's a simple awk approach:
awk '{if(/^>/){split($1,a,"[|.]")}print >> a[2]".fa"}' Protein_FASTA.txt
Or, more concisely, just:
awk '/^>/{split($1,a,"[|.]")}{print >> a[2]".fa"}' Protein_FASTA.txt
When run on the file linked to in your question, that results in the following files:
$ ls
AP014650.fa CP003848.fa CP007713.fa CP012005.fa CP015122.fa CP017645.fa CP018422.fa CP020594.fa CP023021.fa CP024577.fa CP026712.fa CP027245.fa CP030108.fa
CP000522.fa CP003850.fa CP007714.fa CP012007.fa CP015365.fa CP017647.fa CP018678.fa CP020596.fa CP023023.fa CP024578.fa CP026748.fa CP027529.fa CP030109.fa
CP000523.fa CP003887.fa CP008707.fa CP012008.fa CP015366.fa CP017649.fa CP018679.fa CP021322.fa CP023024.fa CP025267.fa CP026749.fa CP027531.fa CU459137.fa
CP000864.fa CP003888.fa CP008708.fa CP012953.fa CP015484.fa CP017651.fa CP019218.fa CP021327.fa CP023025.fa CP026126.fa CP027121.fa CP027532.fa CU459138.fa
CP000865.fa CP003907.fa CP008709.fa CP012954.fa CP015485.fa CP017653.fa CP020573.fa CP021348.fa CP023027.fa CP026127.fa CP027122.fa CP027608.fa CU459139.fa
CP001183.fa CP003908.fa CP008850.fa CP012955.fa CP015486.fa CP017655.fa CP020575.fa CP021783.fa CP023028.fa CP026128.fa CP027124.fa CP027609.fa CU459140.fa
CP001922.fa CP003968.fa CP008851.fa CP012956.fa CP016296.fa CP017657.fa CP020576.fa CP021784.fa CP023030.fa CP026129.fa CP027179.fa CP027610.fa JN377410.fa
CP001923.fa CP004359.fa CP010398.fa CP013925.fa CP016297.fa CP018144.fa CP020577.fa CP021785.fa CP023032.fa CP026339.fa CP027180.fa CP029570.fa LN865144.fa
CP001938.fa CP006769.fa CP010399.fa CP014216.fa CP016299.fa CP018255.fa CP020580.fa CP021786.fa CP023033.fa CP026340.fa CP027181.fa CP029571.fa LN997847.fa
CP002523.fa CP007578.fa CP010400.fa CP014217.fa CP016301.fa CP018257.fa CP020585.fa CP021787.fa CP023035.fa CP026705.fa CP027182.fa CP029572.fa LT594096.fa
CP002524.fa CP007579.fa CP010780.fa CP014292.fa CP016302.fa CP018333.fa CP020589.fa CP022284.fa CP024125.fa CP026706.fa CP027243.fa CP029573.fa Protein_FASTA.txt
CP003501.fa CP007580.fa CP010782.fa CP014293.fa CP017643.fa CP018334.fa CP020593.fa CP022285.fa CP024419.fa CP026708.fa CP027244.fa CP030107.fa
Explanation
if(/^>/){split($1,a,"[|.]")
: if this line starts with a>
, split the first field on any occurrence of either|
or.
and save the results in the arraya
. Since your header lines all start with>lcl|
, then the string you are looking for and a.
, this means that the second value in thea
array will be your target string.
print >> a[2]".fa"
: print (append,>>
) the current line to a file called "whatever the name of this sequence is" (a[2]
) and.fa
. This is run for every line in your input file. Note that if you run the same command again, you will need to first delete the files created the first time. If you don't, because I am using the>>
, you will just append to the existing files.
$endgroup$
1
$begingroup$
Personally, I would have used a "_" along with "." and "|", but who am I. The small advantage of a script is that if its rerun it will clobber the old run output, without fear of appending.
$endgroup$
– Michael G.
2 days ago
2
$begingroup$
@MichaelG. you can do it easily enough in perl, it's just slightly more cumbersome since you need to explicitly open and close each file:perl -ne 'if(/>.+?|(.*?)./){$name=$1; }open(my $fh, ">>","$name.fa"); print $fh "$_"; close($fh)' Protein_FASTA.txt
. I didn't include the_
because the OP seemed to want the text before the version (the.N
) to be the file name. I'd have used the_
myself too.
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
wow so many solutions...let me run this..
$endgroup$
– krushnach Chandra
2 days ago
1
$begingroup$
You're welcome :). @MichaelG. note that I had a mistake in the first version of that perl one-liner. You need>>
, not>
(I've edited the comment) and it still has the same issue with existing files. The only way around that is to store everything in memory and only write at the end:perl -ne 'if(/>.+?|(.*?)./){$name=$1;}push @{$k{$name}},$_; END{for $name (keys(%k)){open(my $fh, ">","$name.fa"); print $fh @{$k{$name}}; close($fh)}}' Protein_FASTA.txt
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
I suggest replacing theif (…){…}
block with the more awk-ish/^>/ {split($1, a, "[|.]")}
, which simplified the actual print look to just{print >> a[2] ".fa"}
.
$endgroup$
– Konrad Rudolph
2 days ago
|
show 7 more comments
$begingroup$
Here's a simple awk approach:
awk '{if(/^>/){split($1,a,"[|.]")}print >> a[2]".fa"}' Protein_FASTA.txt
Or, more concisely, just:
awk '/^>/{split($1,a,"[|.]")}{print >> a[2]".fa"}' Protein_FASTA.txt
When run on the file linked to in your question, that results in the following files:
$ ls
AP014650.fa CP003848.fa CP007713.fa CP012005.fa CP015122.fa CP017645.fa CP018422.fa CP020594.fa CP023021.fa CP024577.fa CP026712.fa CP027245.fa CP030108.fa
CP000522.fa CP003850.fa CP007714.fa CP012007.fa CP015365.fa CP017647.fa CP018678.fa CP020596.fa CP023023.fa CP024578.fa CP026748.fa CP027529.fa CP030109.fa
CP000523.fa CP003887.fa CP008707.fa CP012008.fa CP015366.fa CP017649.fa CP018679.fa CP021322.fa CP023024.fa CP025267.fa CP026749.fa CP027531.fa CU459137.fa
CP000864.fa CP003888.fa CP008708.fa CP012953.fa CP015484.fa CP017651.fa CP019218.fa CP021327.fa CP023025.fa CP026126.fa CP027121.fa CP027532.fa CU459138.fa
CP000865.fa CP003907.fa CP008709.fa CP012954.fa CP015485.fa CP017653.fa CP020573.fa CP021348.fa CP023027.fa CP026127.fa CP027122.fa CP027608.fa CU459139.fa
CP001183.fa CP003908.fa CP008850.fa CP012955.fa CP015486.fa CP017655.fa CP020575.fa CP021783.fa CP023028.fa CP026128.fa CP027124.fa CP027609.fa CU459140.fa
CP001922.fa CP003968.fa CP008851.fa CP012956.fa CP016296.fa CP017657.fa CP020576.fa CP021784.fa CP023030.fa CP026129.fa CP027179.fa CP027610.fa JN377410.fa
CP001923.fa CP004359.fa CP010398.fa CP013925.fa CP016297.fa CP018144.fa CP020577.fa CP021785.fa CP023032.fa CP026339.fa CP027180.fa CP029570.fa LN865144.fa
CP001938.fa CP006769.fa CP010399.fa CP014216.fa CP016299.fa CP018255.fa CP020580.fa CP021786.fa CP023033.fa CP026340.fa CP027181.fa CP029571.fa LN997847.fa
CP002523.fa CP007578.fa CP010400.fa CP014217.fa CP016301.fa CP018257.fa CP020585.fa CP021787.fa CP023035.fa CP026705.fa CP027182.fa CP029572.fa LT594096.fa
CP002524.fa CP007579.fa CP010780.fa CP014292.fa CP016302.fa CP018333.fa CP020589.fa CP022284.fa CP024125.fa CP026706.fa CP027243.fa CP029573.fa Protein_FASTA.txt
CP003501.fa CP007580.fa CP010782.fa CP014293.fa CP017643.fa CP018334.fa CP020593.fa CP022285.fa CP024419.fa CP026708.fa CP027244.fa CP030107.fa
Explanation
if(/^>/){split($1,a,"[|.]")
: if this line starts with a>
, split the first field on any occurrence of either|
or.
and save the results in the arraya
. Since your header lines all start with>lcl|
, then the string you are looking for and a.
, this means that the second value in thea
array will be your target string.
print >> a[2]".fa"
: print (append,>>
) the current line to a file called "whatever the name of this sequence is" (a[2]
) and.fa
. This is run for every line in your input file. Note that if you run the same command again, you will need to first delete the files created the first time. If you don't, because I am using the>>
, you will just append to the existing files.
$endgroup$
Here's a simple awk approach:
awk '{if(/^>/){split($1,a,"[|.]")}print >> a[2]".fa"}' Protein_FASTA.txt
Or, more concisely, just:
awk '/^>/{split($1,a,"[|.]")}{print >> a[2]".fa"}' Protein_FASTA.txt
When run on the file linked to in your question, that results in the following files:
$ ls
AP014650.fa CP003848.fa CP007713.fa CP012005.fa CP015122.fa CP017645.fa CP018422.fa CP020594.fa CP023021.fa CP024577.fa CP026712.fa CP027245.fa CP030108.fa
CP000522.fa CP003850.fa CP007714.fa CP012007.fa CP015365.fa CP017647.fa CP018678.fa CP020596.fa CP023023.fa CP024578.fa CP026748.fa CP027529.fa CP030109.fa
CP000523.fa CP003887.fa CP008707.fa CP012008.fa CP015366.fa CP017649.fa CP018679.fa CP021322.fa CP023024.fa CP025267.fa CP026749.fa CP027531.fa CU459137.fa
CP000864.fa CP003888.fa CP008708.fa CP012953.fa CP015484.fa CP017651.fa CP019218.fa CP021327.fa CP023025.fa CP026126.fa CP027121.fa CP027532.fa CU459138.fa
CP000865.fa CP003907.fa CP008709.fa CP012954.fa CP015485.fa CP017653.fa CP020573.fa CP021348.fa CP023027.fa CP026127.fa CP027122.fa CP027608.fa CU459139.fa
CP001183.fa CP003908.fa CP008850.fa CP012955.fa CP015486.fa CP017655.fa CP020575.fa CP021783.fa CP023028.fa CP026128.fa CP027124.fa CP027609.fa CU459140.fa
CP001922.fa CP003968.fa CP008851.fa CP012956.fa CP016296.fa CP017657.fa CP020576.fa CP021784.fa CP023030.fa CP026129.fa CP027179.fa CP027610.fa JN377410.fa
CP001923.fa CP004359.fa CP010398.fa CP013925.fa CP016297.fa CP018144.fa CP020577.fa CP021785.fa CP023032.fa CP026339.fa CP027180.fa CP029570.fa LN865144.fa
CP001938.fa CP006769.fa CP010399.fa CP014216.fa CP016299.fa CP018255.fa CP020580.fa CP021786.fa CP023033.fa CP026340.fa CP027181.fa CP029571.fa LN997847.fa
CP002523.fa CP007578.fa CP010400.fa CP014217.fa CP016301.fa CP018257.fa CP020585.fa CP021787.fa CP023035.fa CP026705.fa CP027182.fa CP029572.fa LT594096.fa
CP002524.fa CP007579.fa CP010780.fa CP014292.fa CP016302.fa CP018333.fa CP020589.fa CP022284.fa CP024125.fa CP026706.fa CP027243.fa CP029573.fa Protein_FASTA.txt
CP003501.fa CP007580.fa CP010782.fa CP014293.fa CP017643.fa CP018334.fa CP020593.fa CP022285.fa CP024419.fa CP026708.fa CP027244.fa CP030107.fa
Explanation
if(/^>/){split($1,a,"[|.]")
: if this line starts with a>
, split the first field on any occurrence of either|
or.
and save the results in the arraya
. Since your header lines all start with>lcl|
, then the string you are looking for and a.
, this means that the second value in thea
array will be your target string.
print >> a[2]".fa"
: print (append,>>
) the current line to a file called "whatever the name of this sequence is" (a[2]
) and.fa
. This is run for every line in your input file. Note that if you run the same command again, you will need to first delete the files created the first time. If you don't, because I am using the>>
, you will just append to the existing files.
edited 2 days ago
answered 2 days ago
terdon♦terdon
4,5502830
4,5502830
1
$begingroup$
Personally, I would have used a "_" along with "." and "|", but who am I. The small advantage of a script is that if its rerun it will clobber the old run output, without fear of appending.
$endgroup$
– Michael G.
2 days ago
2
$begingroup$
@MichaelG. you can do it easily enough in perl, it's just slightly more cumbersome since you need to explicitly open and close each file: perl -ne 'if(/>.+?\|(.*?)\./){$name=$1; }open(my $fh, ">>","$name.fa"); print $fh "$_"; close($fh)' Protein_FASTA.txt
I didn't include the _ because the OP seemed to want the text before the version (the .N) to be the file name. I'd have used the _ myself too.
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
wow so many solutions...let me run this..
$endgroup$
– krushnach Chandra
2 days ago
1
$begingroup$
You're welcome :). @MichaelG. note that I had a mistake in the first version of that perl one-liner. You need >>, not > (I've edited the comment) and it still has the same issue with existing files. The only way around that is to store everything in memory and only write at the end: perl -ne 'if(/>.+?\|(.*?)\./){$name=$1;}push @{$k{$name}},$_; END{for $name (keys(%k)){open(my $fh, ">","$name.fa"); print $fh @{$k{$name}}; close($fh)}}' Protein_FASTA.txt
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
I suggest replacing the if (…){…} block with the more awk-ish /^>/ {split($1, a, "[|.]")}, which simplifies the actual print block to just {print >> a[2] ".fa"}.
$endgroup$
– Konrad Rudolph
2 days ago
|
show 7 more comments
1
$begingroup$
Personally, I would have used a "_" along with "." and "|", but who am I. The small advantage of a script is that if its rerun it will clobber the old run output, without fear of appending.
$endgroup$
– Michael G.
2 days ago
2
$begingroup$
@MichaelG. you can do it easily enough in perl, it's just slightly more cumbersome since you need to explicitly open and close each file:perl -ne 'if(/>.+?|(.*?)./){$name=$1; }open(my $fh, ">>","$name.fa"); print $fh "$_"; close($fh)' Protein_FASTA.txt
. I didn't include the_
because the OP seemed to want the text before the version (the.N
) to be the file name. I'd have used the_
myself too.
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
wow so many solutions...let me run this..
$endgroup$
– krushnach Chandra
2 days ago
1
$begingroup$
You're welcome :). @MichaelG. note that I had a mistake in the first version of that perl one-liner. You need>>
, not>
(I've edited the comment) and it still has the same issue with existing files. The only way around that is to store everything in memory and only write at the end:perl -ne 'if(/>.+?|(.*?)./){$name=$1;}push @{$k{$name}},$_; END{for $name (keys(%k)){open(my $fh, ">","$name.fa"); print $fh @{$k{$name}}; close($fh)}}' Protein_FASTA.txt
$endgroup$
– terdon♦
2 days ago
1
$begingroup$
I suggest replacing theif (…){…}
block with the more awk-ish/^>/ {split($1, a, "[|.]")}
, which simplified the actual print look to just{print >> a[2] ".fa"}
.
$endgroup$
– Konrad Rudolph
2 days ago
1
1
$begingroup$
Personally, I would have used a "_" along with "." and "|", but who am I. The small advantage of a script is that if its rerun it will clobber the old run output, without fear of appending.
$endgroup$
– Michael G.
2 days ago
$begingroup$
Personally, I would have used a "_" along with "." and "|", but who am I. The small advantage of a script is that if its rerun it will clobber the old run output, without fear of appending.
$endgroup$
– Michael G.
2 days ago
2
2
$begingroup$
@MichaelG. you can do it easily enough in perl, it's just slightly more cumbersome since you need to explicitly open and close each file:
perl -ne 'if(/>.+?\|(.*?)\./){$name=$1; }open(my $fh, ">>","$name.fa"); print $fh "$_"; close($fh)' Protein_FASTA.txt
. I didn't include the _
because the OP seemed to want the text before the version (the .N
) to be the file name. I'd have used the _
myself too.$endgroup$
– terdon♦
2 days ago
$begingroup$
@MichaelG. you can do it easily enough in perl, it's just slightly more cumbersome since you need to explicitly open and close each file:
perl -ne 'if(/>.+?|(.*?)./){$name=$1; }open(my $fh, ">>","$name.fa"); print $fh "$_"; close($fh)' Protein_FASTA.txt
. I didn't include the _
because the OP seemed to want the text before the version (the .N
) to be the file name. I'd have used the _
myself too.$endgroup$
– terdon♦
2 days ago
1
1
$begingroup$
wow so many solutions...let me run this..
$endgroup$
– krushnach Chandra
2 days ago
$begingroup$
wow so many solutions...let me run this..
$endgroup$
– krushnach Chandra
2 days ago
1
1
$begingroup$
You're welcome :). @MichaelG. note that I had a mistake in the first version of that perl one-liner. You need
>>
, not >
(I've edited the comment) and it still has the same issue with existing files. The only way around that is to store everything in memory and only write at the end: perl -ne 'if(/>.+?\|(.*?)\./){$name=$1;}push @{$k{$name}},$_; END{for $name (keys(%k)){open(my $fh, ">","$name.fa"); print $fh @{$k{$name}}; close($fh)}}' Protein_FASTA.txt
$endgroup$
– terdon♦
2 days ago
$begingroup$
You're welcome :). @MichaelG. note that I had a mistake in the first version of that perl one-liner. You need
>>
, not >
(I've edited the comment) and it still has the same issue with existing files. The only way around that is to store everything in memory and only write at the end: perl -ne 'if(/>.+?|(.*?)./){$name=$1;}push @{$k{$name}},$_; END{for $name (keys(%k)){open(my $fh, ">","$name.fa"); print $fh @{$k{$name}}; close($fh)}}' Protein_FASTA.txt
$endgroup$
– terdon♦
2 days ago
1
1
$begingroup$
I suggest replacing the
if (…){…}
block with the more awk-ish /^>/ {split($1, a, "[|.]")}
, which simplified the actual print look to just {print >> a[2] ".fa"}
.$endgroup$
– Konrad Rudolph
2 days ago
$begingroup$
I suggest replacing the
if (…){…}
block with the more awk-ish /^>/ {split($1, a, "[|.]")}
, which simplified the actual print look to just {print >> a[2] ".fa"}
.$endgroup$
– Konrad Rudolph
2 days ago
|
show 7 more comments
Thanks for contributing an answer to Bioinformatics Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fbioinformatics.stackexchange.com%2fquestions%2f7273%2fsplitting-fasta-file-into-smaller-files-based-on-header-pattern%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
$begingroup$
This is very easy with a bit of scripting; one script will do it all. We do have R experts here (best solution)... Otherwise I'll post the code. How do you want each file named (important no matter which approach you take)?
$endgroup$
– Michael G.
2 days ago