[Shootout-list] Ada word-frequency

Marius Amado Alves amado.alves@netcabo.pt
Thu, 24 Mar 2005 02:23:41 +0000


-- The Great Computer Language Shootout
-- http://shootout.alioth.debian.org/
--
-- contributed by Marius Amado Alves
--
-- gnatmake -O3 -gnatp word_frequency.adb

with Ada.Streams;
with Ada.Streams.Stream_IO;
with Ada.Text_IO;
with Ada.Text_IO.Text_Streams;

procedure Word_Frequency is

    use Ada.Streams;

    Buffer : Stream_Element_Array (1 .. 4096);
    Input_Stream : Ada.Text_IO.Text_Streams.Stream_Access
      := Ada.Text_IO.Text_Streams.Stream (Ada.Text_IO.Current_Input);
    N : Stream_Element_Offset;
    Is_Separator : array (Stream_Element) of Boolean :=
      (Character'Pos ('A') .. Character'Pos ('Z') |
       Character'Pos ('a') .. Character'Pos ('z') => False,
       others => True);

    -- N-ary tree of word counts
    -- used to increment the counts in one pass of the input file
    -- branches on the letter
    -- carries the count
    -- very fast
    -- but very space consuming

    subtype Letter is Stream_Element range
       Character'Pos ('a') .. Character'Pos ('z');
    type Word is array (Positive range <>) of Letter;
    type Tree;
    type Tree_Ptr is access Tree;
    type Node is
       record
          Count : Natural := 0;
          Subtree : Tree_Ptr := null;
       end record;
    type Tree is array (Letter) of Node;

    procedure Inc (X : in out Integer) is begin X := X + 1; end;
    procedure Dec (X : in out Integer) is begin X := X - 1; end;
    pragma Inline (Inc, Dec);

    procedure Inc_Word (Parent : Tree_Ptr; Descendents : Word) is
    begin
       if Descendents'Length > 0 then
          declare
             Child_Index : Positive := Descendents'First;
             Child : Letter renames Descendents (Child_Index);
          begin
             if Descendents'Length = 1 then
                Inc (Parent (Child).Count);
             else
                if Parent (Child).Subtree = null then
                   Parent (Child).Subtree := new Tree;
                end if;
                Inc_Word
                  (Parent (Child).Subtree,
                   Descendents (Child_Index + 1 .. Descendents'Last));
             end if;
          end;
       end if;
    end;

    -- Binary tree of word counts
    -- used for sorting the result by the count (frequency)
    -- branches on the word count
    -- carries the word form

    type Form_Ptr is access Word;
    type Binary_Tree;
    type Binary_Tree_Ptr is access Binary_Tree;
    type Binary_Tree is
       record
          Form : Form_Ptr;
          Count : Natural;
          Left, Right : Binary_Tree_Ptr;
       end record;

    procedure Add_Node (Parent : in out Binary_Tree_Ptr; Form : 
Form_Ptr; Count : Natural) is
    begin
       if Parent = null then
          Parent := new Binary_Tree;
          Parent.Form := Form;
          Parent.Count := Count;
       else
          if Count < Parent.Count then
             Add_Node (Parent.Left, Form, Count);
          else
             Add_Node (Parent.Right, Form, Count);
          end if;
       end if;
    end;

    -- end of binary tree primitives

    Root : Tree_Ptr := new Tree;
    Btree : Binary_Tree_Ptr := null;
    Current_Word : Word (1 .. 1000);
    Current_Word_Length : Natural range 0 .. Current_Word'Last := 0;
    In_Word : Boolean := False;

    procedure Append_To_Word (E : Letter) is
    begin
       Inc (Current_Word_Length);
       Current_Word (Current_Word_Length) := E;
       In_Word := True;
    end;

    procedure End_Word is
    begin
       if Current_Word_Length > 0 then
          Inc_Word (Root, Current_Word (1 .. Current_Word_Length));
       end if;
       Current_Word_Length := 0;
       In_Word := False;
    end;

    To_Lower : array (Stream_Element) of Letter;

    procedure Initialise_To_Lower_Map is
       D : Integer := Character'Pos ('a') - Character'Pos ('A');
    begin
       for I in Character'Pos ('a') .. Character'Pos ('z') loop
          To_Lower (Stream_Element (I)) := Letter (I);
          To_Lower (Stream_Element (I - D)) := Letter (I);
       end loop;
    end;

    procedure Process (S : Stream_Element_Array) is
    begin
       for I in S'Range loop
          if Is_Separator (S (I)) then
             if In_Word then End_Word; end if;
          else
             Append_To_Word (To_Lower (S (I)));
          end if;
       end loop;
    end;

    pragma Inline (Append_To_Word, End_Word, Process);

    procedure Populate_Btree (Ntree : Tree_Ptr) is
    begin
       Inc (Current_Word_Length);
       for I in Letter'Range loop
          Current_Word (Current_Word_Length) := I;
          if Ntree (I).Count > 0 then
             Add_Node
                (Btree,
                 Form => new Word'(Current_Word (1 .. 
Current_Word_Length)),
                 Count => Ntree (I).Count);
          end if;
          if Ntree (I).Subtree /= null then
             Populate_Btree (Ntree (I).Subtree);
          end if;
       end loop;
       Dec (Current_Word_Length);
    end;

    procedure Populate_Btree is
    begin
       Current_Word_Length := 0;
       Populate_Btree (Root);
    end;

    function To_String (X : Form_Ptr) return String is
       S : String (X'Range);
    begin
       for I in X'Range loop
          S (I) := Character'Val (X (I));
       end loop;
       return S;
    end;

    subtype String7 is String (1 .. 7);

    function Img7 (X : Natural) return String7 is
       S : String := Natural'Image (X);
    begin
       return String' (1 .. 8 - S'Length => ' ') & S (2 .. S'Last);
    end;

    pragma Inline (To_String, Img7);

    procedure Dump_Btree (X : Binary_Tree_Ptr := Btree) is
    begin
       if X /= null then
          Dump_Btree (X.Right);
          Ada.Text_IO.Put_Line
            (Img7 (X.Count) & " " & To_String (X.Form));
          Dump_Btree (X.Left);
       end if;
    end;

begin
    Initialise_To_Lower_Map;
    loop
       Read (Root_Stream_Type'Class (Input_Stream.all), Buffer, N);
       Process (Buffer (1 .. N));
       exit when N < Buffer'Length;
    end loop;
    if In_Word then End_Word; end if;
    Populate_Btree;
    Dump_Btree;
end;