On Thu, 5 Jun 2008, citromatik wrote:
>
> Hi all,
>
> I'm trying to write a function for reading fasta sequences from a file (one
> sequence per call).
> For those who are not familiar with this format, here is a brief
> description:
> The fasta format has a first line beginning with a ">" symbol, following a
> description of the sequence. The rest of the lines are the sequence itself.
> Here is an example with 2 fasta records:
> > sequence1
> ACAGCGACTAGCTAGC
> CAGCCGCGGACGCAAT
> ACGAGACGATGCGCAG
> ACCC
> > sequence2
> TTTTGCAGCGGCAATA
> GCAGCGAGCAGCACTA
> GCACACA
>
> I want to be able to say:
>
> let fasta_stream_of_channel fh = Stream.from fun_to_read_fasta;;
> let () = Stream.iter f (fasta_stream_of_channel datafile);;
>
> or at least:
>
> let operation_on_fasta channel f =
> try
> let faseq = fun_to_read_fasta channel in
> f faseq
> with
> (* Some exception "End_of_file"*)
> ; ;
>
> I'm interested in the "fun_to_read_fasta" part.
>
> This is my code so far (ugly and with a couple of bugs):
>
> type fasta = {mutable header:string; mutable sequence:string};;
>
> let header = ref "";;
> let seq = ref "";;
> let readseq fhh =
> let faseq = {header="";sequence=""} in
> let rec aux fa_fh =
> try
> let line = input_line fa_fh in
> if String.contains line '>' then
> begin
> if !header = "" then
> begin
> header := line;
> aux fa_fh;
> end
> else
> begin
> faseq.header <- !header;
> faseq.sequence <- !seq;
> seq := "";
> header := line;
> faseq
> end
> end
> else
> begin
> seq := !seq ^ line;
> aux fa_fh
> end
> with
> End_of_file -> if !seq = "" then faseq
> else
> begin
> faseq.header <- !header;
> faseq.sequence <- !seq;
> seq := "";
> faseq
> end
> in aux fhh
> ; ;
>
> Any help would be very much welcome,
>
> Thank you very much in advance,
>
> M;
Here is some not-so-old code that I dug from a program.
I don't have much time right now to clean it up, so I'll leave it as an
exercise 
You need to know a few things:
- it uses the micmatch syntax extension for testing the type of line (RE
...). You really don't have to.
- the function takes in in_channel and returns a function (loop). This
function returns Some (name, seq) or None if we are at the end of the
channel.
- the channel is not closed at the end (the code that is responsible for
opening it should also be responsible for closing it)
let get_prot ?(maxi = max_int) ic =
let pepid = ref "" in
let buf = Buffer.create 1000 in
let counter = ref 0 in
let rec loop () =
let try s = input_line ic in
match s with
RE ">" "tcag|"? (digit+ as new_pepid) ->
let name = !pepid in
let seq = Buffer.contents buf in
pepid := new_pepid;
Buffer.clear buf;
if seq <> "" then
(incr counter;
if !counter mod 10_000 = 0 then
printf "processed %i sequencesn%!" !counter;
if !counter < maxi then
Some (name, seq)
else None)
else
loop ()
| _ ->
Buffer.add_string buf s;
loop ()
with End_of_file ->
let name = !pepid in
let seq = Buffer.contents buf in
pepid := "";
Buffer.clear buf;
if seq <> "" then
(incr counter;
Some (name, seq))
else
(printf "total: %i sequencesn%!" !counter;
None) in
loop
--
http: //wink.com/profile/mjambon
http: //mjambon.com
.