(* EXERCISE 
   ===============================================================
   Implement in ocaml the following grammar

     S -1-> ""
     S -2-> "a" . S . "b"

   The language generated by this grammar is { a^b^n | n in Nat } 

   This grammar is not ambigous since, given n in Nat,
   there is only one way to derive a^nb^n.
   =============================================================== *)


(* We explain in XXX steps how to build powerful and elegant parsers *)

(* STEP -1-  A very specific parser
  
   A parser is a function that reads the input and return a boolean (recognized or not?) 
   together with the part of the input that has not been scan.
*)

let rec (anbn_parser1: char list -> bool * char list) = 
  fun sequence ->
	match sequence with
	| [] -> (true,sequence)
	| 'a' :: tail_sequence -> 
		let (b2,sequence2) = anbn_parser1 tail_sequence
		in
		  (match sequence2 with
		  | 'b':: tail_sequence2 -> (b2, tail_sequence2)
		  | _ -> (false,sequence2)
		  )
	| _  -> (true,sequence)


(* TESTING FUNCTIONS *)

let rec (sequence_from_string: string -> char list) = 
  fun str -> 
	if str = ""
	then []
	else str.[0] :: (sequence_from_string (String.sub str 1 ((String.length str) - 1)))


let (is_recognized_by: (char list -> bool * char list) -> string -> bool) = 
  fun parse word ->
	begin
	  print_string (String.concat "" [ "\n parser \"" ; word ; "\" = \n"]) ;
	  let (bool,sequence) = parse (sequence_from_string word)
	  in bool && (sequence = [])
	end
;;

(* TEST parser1 *)

is_recognized_by anbn_parser1 "" ;;
is_recognized_by anbn_parser1 "ab" ;;
is_recognized_by anbn_parser1 "aabb" ;;
is_recognized_by anbn_parser1 "ba" ;;


(* STEP -2- Generalization step I 

   We reuse the principle of parser1 but, in order to generalize
   the previous code we do not exploit the fact that the input is a char list:
   We use two functions: first, tail and is_empty instead of matching on a list.
*)

let (is_empty: char list -> bool) = 
  function
    | [] -> true 
    | _ -> false


let (first: char list -> char option) = 
  function
    | []   -> None
    | c::_ -> Some c


let (tail: char list -> char list) = 
  function
    | [] -> []
    | _::others -> others


let rec (anbn_parser2: char list -> bool * char list) = 
  fun sequence ->
	match first sequence with
	| None -> (true,sequence)
	| Some 'a' -> 
		let (b2,sequence2) = anbn_parser2 (tail sequence)
		in let 
		    (b3,sequence3) = (match first sequence2 with
                                    | Some 'b' -> (true , tail sequence2)
                                    | _        -> (false, sequence2)
				   )
		in (b2 && b3,sequence3)
	| _ -> (true, sequence)
;;		  

(* TEST anbn_parser2 *)

is_recognized_by anbn_parser2 "" ;;
is_recognized_by anbn_parser2 "ab" ;;
is_recognized_by anbn_parser2 "aabb" ;;
is_recognized_by anbn_parser2 "ba" ;;


(* -3- Generalization of the input format

   We can use any kind of input sequence as long as it provides the functions: first, tail, is_empty

*)

module Sequence = 
  (struct

    type sequence =  char Stream.t

    type t = sequence

    let (first: sequence -> char option) = fun stream ->
	  Stream.peek stream 
	    
    let (tail: sequence -> sequence) = fun stream -> 
	  begin
	    Stream.junk stream ; stream 
	  end
	    
    let (is_empty: sequence -> bool) = fun stream ->
	  Stream.peek stream = None
	    
    let (from_string: string -> sequence) = fun string ->
	  Stream.of_string string

    let rec (to_list: sequence -> 'x list) = fun sequence ->
	  match first sequence with
	  | Some x -> x::(to_list (tail sequence))
	  | None -> []

    let (from_list: 'x list -> sequence) = Stream.of_list ;;

  end)


module Sequence = 
  (struct 
    type sequence = char list
    type t = sequence

    let (is_empty: sequence -> bool) = 
      function
	| [] -> true 
	| _ -> false

    let (first: sequence -> char option) = 
      function
	| []   -> None
	| c::_ -> Some c


    let (tail: sequence -> sequence) = 
      function
	| [] -> []
	| _::others -> others

    let rec (from_string: string -> sequence) = 
      fun  str -> 
	    if str = ""
	    then []
	    else str.[0] :: (from_string (String.sub str 1 ((String.length str) - 1)))
  end)


let rec (anbn_parser3: Sequence.t -> bool * Sequence.t) = 
  fun sequence ->
	match Sequence.first sequence with
	| None -> (true,sequence)
	| Some 'a' -> 
		let (b2,sequence2) = anbn_parser3 (Sequence.tail sequence)
		in let 
		    (b3,sequence3) = (match Sequence.first sequence2 with
                                     | Some 'b' -> (true , Sequence.tail sequence2)
                                     | _        -> (false, sequence2)
				   )
		in (b2 && b3,sequence3)
	| _ -> (true, sequence)


(* -5- Generalization step IV -- the parser type
   
   Whatever the grammar, a word is recognized if 
     1) the parser returns "true" 
     2) all the sequence has been scan.

   The first condition is specific to the grammar, whereas the second
   condition does not depend on the grammar. So, instead of writing condition (2)
   in the parser, we check it in an external function * is_recognized_by * that runs the parser
   on a string which is converted into sequence.
   
   The * parse *  function takes as parameters:
     1. a parser to run
     2. a string 
   and returns a boolean 

   We define the type parser as:      *)


type parser = Sequence.t -> bool * Sequence.t

let (is_recognized_by: parser -> string -> bool) = 
  fun parse word ->
	begin
	  print_string (String.concat "" [ "\n parser \"" ; word ; "\" = \n"]) ;
	  let (bool,sequence) = parse (Sequence.from_string word)
	  in bool && (Sequence.is_empty sequence)
	end
;;

(* TEST anbn_parser3 *)

is_recognized_by anbn_parser3 "" ;;
is_recognized_by anbn_parser3 "ab" ;;
is_recognized_by anbn_parser3 "aabb" ;;
is_recognized_by anbn_parser3 "ba" ;;


(* -4- Generalization step III -- a parser builder
   
   We do the same case study as before with the help of an auxilliary function * parse_letter *
   It is not a parser but a constructor of parser: it takes a letter then,
   it takes an sequence and return the unscan part of the sequence and a boolean that tells if the first 
   element of the sequence is the expected letter. 

   With the help of * parse_letter *, the code of * anbn_parser * is greatly simplified.
   
*)

let (parse_letter: string -> (Sequence.t -> bool * Sequence.t)) = 
  fun letter -> 
	(fun sequence ->
	      if letter = "" 
	      then (true,sequence)
	      else
		match Sequence.first sequence with
		| Some c -> (letter = String.make 1 c, Sequence.tail sequence)
		| None   -> (false, sequence)
	)

let rec (anbn_parser4: Sequence.t -> bool * Sequence.t) = 
  fun sequence ->
	match Sequence.first sequence with
	| None -> (true,sequence)
	| Some 'a' ->
		let 
		    (b1,sequence1) = parse_letter "a" sequence
		in let 
		    (b2,sequence2) = anbn_parser4 sequence1
		in let 
		    (b3,sequence3) = parse_letter "b" sequence2
		in 
		  (b1 && b2 && b3, sequence3)
	| _ -> (true,sequence)
;;

(* TEST of parser4 *)

is_recognized_by anbn_parser4 "" ;;
is_recognized_by anbn_parser4 "ab" ;;
is_recognized_by anbn_parser4 "aabb" ;;
is_recognized_by anbn_parser4 "ba" ;;


(* -6- Generalization step V -- using combinator of parsers 

   The version 4 of the anbn parser runs three parsers one after the other:

     parse_letter "a" <then> anbn_parser4 <then> parse_letter "b"

   We would like to define a <then> operator that is able of managing
   the sequential runs of two parsers. The previous sequence of three parsers
   could then be obtained by two applications of <then>, as below:
   
    (parse_letter "a" <then> anbn_parser4) <then> parse_letter "b"
   
   The <then> operators should take as parameters two parsers 
   and should return a parser (in order to be used by the next <then> operator)
       which is a function that takes an sequence and returns a couple sequence * boolean

*)

let (then_operator: parser -> parser -> parser) = 
  fun parser1 parser2 ->
	(fun sequence ->
	      let (b1,sequence1) = parser1 sequence in
		if not (b1)
		then (false,sequence)
		else parser2 sequence1 
	)

(* We introduce an infix notation for the then_operator to allow 
   writing   (p1 <.> p2)   instead of   (then_operator p1 p2)
   and we can get rid of parenthesis and write
        p1 <.> p2 <.> p3   instead of   ((p1 <.> p2) <.> p3)
*)

let (<.>) p1 p2 = then_operator p1 p2 

let rec (anbn_parser5: Sequence.t -> bool * Sequence.t) = 
  fun sequence ->
	match Sequence.first sequence with
	| None -> (true,sequence)
	| Some 'a' -> ( (parse_letter "a") <.> anbn_parser5 <.> (parse_letter "b") ) sequence
	| _ -> (true,sequence)
;;


(* TEST of parser5 *)

is_recognized_by anbn_parser5 "" ;;
is_recognized_by anbn_parser5 "ab" ;;
is_recognized_by anbn_parser5 "aabb" ;;
is_recognized_by anbn_parser5 "ba" ;;


let rec (anbn_parser6: Sequence.t -> bool * Sequence.t) = 
  fun sequence ->
	match Sequence.first sequence with
	| Some 'a' -> ( (parse_letter "a") <.> anbn_parser6 <.> (parse_letter "b") ) sequence
	| _        -> (parse_letter "") sequence
;;


(* TEST of parser6 *)

is_recognized_by anbn_parser6 "" ;;
is_recognized_by anbn_parser6 "ab" ;;
is_recognized_by anbn_parser6 "aabb" ;;
is_recognized_by anbn_parser6 "ba" ;;


(* -7- Generalization step VI -- using more combinators of parsers 

   The empty word (epsilon = "") is part of the anbn langage so, the
   anbn parser6 still have to test if the sequence is the empty word
   before switching to the other possibility. In the spirit of the
   <then> operator, this switch could be done as an <else> operator
   combining two parsers:

      parse_epsilon <else> ((parse_letter "a") <.> anbn_parser6 <.> (parse_letter "b"))

*)

(* BE CAREFUL WITH STREAMS: a backup is needed to cancel side-effect in streams

let (else_operator: parser -> parser -> parser) = 
  fun parser1 parser2 ->
	(fun sequence ->
	      let backup = Sequence.to_list sequence 
	      in
	      let (b1,sequence') = parser1 (Sequence.from_list backup) in 
		if b1 
		then (true,sequence')
		else 
		  let (b2,sequence') = parser2 (Sequence.from_list backup) in
		    if b2 
		    then (true ,sequence')
		    else (false, Sequence.from_list backup)
	)
*)
 
let (else_operator: parser -> parser -> parser) = 
  fun parser1 parser2 ->
	(fun sequence ->
	      let (b1,sequence') = parser1 sequence in
		if b1 
		then (true,sequence')
		else 
		  let (b2,sequence') = parser2 sequence in
		    if b2 
		    then (true ,sequence')
		    else (false,sequence)
	)


(* We introduce an infix notation for the then_operator to allow 
   writing   (r1 <|> p2)   instead of   (else_operator p1 p2)
*)

let (<|>) p1 p2 = else_operator p1 p2 


let rec (anbn_parser: Sequence.t -> bool * Sequence.t) = fun sequence ->
   ( 
     ( (parse_letter "a") <.> anbn_parser <.> (parse_letter "b") )
       <|>
     (parse_letter "")
  ) 
  sequence
;;


(* TEST of parser *)

is_recognized_by anbn_parser "" ;;
is_recognized_by anbn_parser "ab" ;;
is_recognized_by anbn_parser "aabb" ;;
is_recognized_by anbn_parser "ba" ;;

type parser = Sequence.t -> bool * Sequence.t

type 't extractor = Sequence.t -> bool * 't * Sequence.t

let (extend_parser_with: parser -> 'result -> 'result extractor) =
  fun parser result -> 
	(fun sequence ->
	      let (b,sequence') = parser sequence
	      in (b,result,sequence')
	)


let (-->) parser result = extend_parser_with parser result ;;

let (apply_on_extractor: 'r extractor -> ('r -> 'r ) -> 'r extractor) = 
  fun extractor f ->
	(fun sequence ->
	      let (b,r,sequence') = extractor sequence
	      in (b,f r,sequence')
	)

let (==>) extractor f = apply_on_extractor extractor f ;;


let (then_operator: ('r list) extractor -> ('r list) extractor -> ('r list) extractor) = 
  fun extractor1 extractor2 ->
	(fun sequence ->
	      let (b1,rs1,sequence1) = extractor1 sequence in
		if not (b1)
		then (false,[],sequence)
		else 
		  let (b2,rs2,sequence2) = extractor2 sequence1 
		  in (b1 && b2, rs1 @ rs2, sequence2)
	)

let (<.>) ex1 ex2 = then_operator ex1 ex2 ;;

let (else_operator: ('r list) extractor -> ('r list) extractor -> ('r list) extractor) = 
  fun extractor1 extractor2 ->
	(fun sequence ->
	      let (b1,rs1,sequence') = extractor1 sequence in
		if b1 
		then (true,rs1,sequence')
		else 
		  let (b2,rs2,sequence') = extractor2 sequence in
		    if b2 
		    then (true , rs2, sequence')
		    else (false, [] , sequence)
	)

let (<|>) ex1 ex2 = else_operator ex1 ex2 ;;


let (sum: int list -> int list) = fun ns -> [ List.fold_left (+) 0 ns ] ;;

let rec (anbn_extractor: Sequence.t -> bool * int list * Sequence.t) = fun sequence ->
 (
    ( ( ((parse_letter "a") --> [1]) <.> anbn_extractor <.> ((parse_letter "b") --> [0]) ) ==> sum)
   <|>
    ((parse_letter "") --> [0])
 )
  sequence
;;
      

let (run: 'result extractor -> string -> bool * 'result) = 
  fun extract word ->
	begin
	  print_string (String.concat "" [ "\n extractor \"" ; word ; "\" = \n"]) ;
	  let (bool,rs,sequence) = extract (Sequence.from_string word)
	  in (bool && (Sequence.is_empty sequence), rs)
	end
;;


(* TEST of parser *)

run anbn_extractor "" ;;
run anbn_extractor "ab" ;;
run anbn_extractor "aabb" ;;
run anbn_extractor "ba" ;;


(* Derivation tree *)


(* Ambigous grammar *)


(* -8- Generalization step VII -- curryfication

   Thanks to curryfication: We could get rid of the argument * sequence * and just write:


let rec (anbn_parser: Sequence.t -> bool * Sequence.t) = 
  ( (parse_letter "a") <.> anbn_parser <.> (parse_letter "b") )
    <|>
  (parse_letter "")
;;

*)