[Shootout-list] OCaml k-nucleotide

Christophe TROESTLER del-con@tiscali.be
Sat, 26 Mar 2005 22:34:00 +0100 (CET)


----Next_Part(Sat_Mar_26_22_34_00_2005_490)--
Content-Type: Text/Plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Hi,

Here is an OCaml implementation of k-nucleotide.

ChriS

----Next_Part(Sat_Mar_26_22_34_00_2005_490)--
Content-Type: Text/Plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="k-nucleotide.ml"

(* k-nucleotide.ml
 *
 * The Great Computer Language Shootout
 * http://shootout.alioth.debian.org/
 *
 * Contributed by Troestler Christophe
 *)

open Printf
module H = Hashtbl

(* Scratch table shared by every call to [counts]; it maps a fragment of
   the sequence to the number of times that fragment has been seen. *)
let count = H.create 100000

(** [counts k dna] tallies every substring of length [k] of [dna],
    overlapping occurrences included, and returns the table mapping each
    distinct substring to its number of occurrences.

    The table returned is the shared [count] table, which is emptied at
    the start of each call: a result is therefore only meaningful until
    the next call to [counts].  When [k] exceeds the length of [dna] the
    table comes back empty. *)
let counts k dna =
  H.clear count;
  (* Record one more occurrence of [key]. *)
  let bump key =
    match H.find count key with
    | seen -> H.replace count key (seen + 1)
    | exception Not_found -> H.add count key 1
  in
  let last_start = String.length dna - k in
  for start = 0 to last_start do
    bump (String.sub dna start k)
  done;
  count

(** Ordering on [(key, frequency)] pairs for use with [List.sort]: the
    pair with the larger frequency comes first, and pairs whose
    frequencies are neither greater nor smaller than each other are
    ordered by ascending key. *)
let compare_freq ((key_a : string), (freq_a : float)) (key_b, freq_b) =
  if freq_a < freq_b then 1
  else if freq_a > freq_b then -1
  else String.compare key_a key_b

(** [write_frequencies k dna] prints, one per line on standard output,
    every distinct length-[k] fragment of [dna] followed by its share of
    all length-[k] fragments as a percentage with two decimals.  Lines
    are ordered by [compare_freq] and the listing ends with an empty
    line. *)
let write_frequencies k dna =
  let table = counts k dna in
  let total = float (H.fold (fun _ n acc -> n + acc) table 0) in
  let percentage n = 100. *. float n /. total in
  H.fold (fun key n acc -> (key, percentage n) :: acc) table []
  |> List.sort compare_freq
  |> List.iter (fun (key, pct) -> printf "%s %.2f\n" key pct);
  print_string "\n"

(** [write_count seq dna] prints the number of occurrences of [seq] in
    [dna] (overlapping ones included), a tab, and then [seq] itself,
    followed by a newline.  A fragment that never occurs is reported
    with a count of 0. *)
let write_count seq dna =
  let table = counts (String.length seq) dna in
  let occurrences =
    match H.find table seq with
    | n -> n
    | exception Not_found -> 0
  in
  printf "%d\t%s\n" occurrences seq


(** The sequence read from standard input: lines are skipped up to and
    including the first one that starts with ">THREE", then every
    remaining line up to end of input is upper-cased and concatenated
    (line terminators are dropped by [input_line]).

    This is evaluated once, when the module is initialised.
    @raise End_of_file if the input holds no line starting with
    ">THREE". *)
let dna_three =
  let header = ">THREE" in
  let header_len = String.length header in
  (* Test the length first so that [String.sub] is never asked for more
     characters than the line holds; no exception handler is needed. *)
  let is_three line =
    String.length line >= header_len
    && String.sub line 0 header_len = header
  in
  while not (is_three (input_line stdin)) do () done;
  let buf = Buffer.create 1000 in
  (* [input_line] signals the end of the input with [End_of_file], which
     is what terminates this otherwise endless loop. *)
  (try
     while true do
       (* [String.uppercase] no longer exists in current compilers; the
          ASCII variant is sufficient for nucleotide letters. *)
       Buffer.add_string buf (String.uppercase_ascii (input_line stdin))
     done
   with End_of_file -> ());
  Buffer.contents buf

(* Program entry point: print the frequency tables for fragments of
   length 1 and 2, then the occurrence counts of a fixed list of
   fragments, all computed over [dna_three]. *)
let () =
  List.iter (fun k -> write_frequencies k dna_three) [1; 2];
  let fragments =
    ["GGT"; "GGTA"; "GGTATT"; "GGTATTTTAATT"; "GGTATTTTAATTTATAGT"]
  in
  List.iter (fun fragment -> write_count fragment dna_three) fragments

----Next_Part(Sat_Mar_26_22_34_00_2005_490)----