(* EXERCISE =============================================================== Implement in ocaml the following grammar S -1-> "" S -2-> "a" . S . "b" The language generated by this grammar is { a^b^n | n in Nat } This grammar is not ambigous since, given n in Nat, there is only one way to derive a^nb^n. =============================================================== *) (* We explain in XXX steps how to build powerful and elegant parsers *) (* STEP -1- A very specific parser A parser is a function that reads the input and return a boolean (recognized or not?) together with the part of the input that has not been scan. *) let rec (anbn_parser1: char list -> bool * char list) = fun sequence -> match sequence with | [] -> (true,sequence) | 'a' :: tail_sequence -> let (b2,sequence2) = anbn_parser1 tail_sequence in (match sequence2 with | 'b':: tail_sequence2 -> (b2, tail_sequence2) | _ -> (false,sequence2) ) | _ -> (true,sequence) (* TESTING FUNCTIONS *) let rec (sequence_from_string: string -> char list) = fun str -> if str = "" then [] else str.[0] :: (sequence_from_string (String.sub str 1 ((String.length str) - 1))) let (is_recognized_by: (char list -> bool * char list) -> string -> bool) = fun parse word -> begin print_string (String.concat "" [ "\n parser \"" ; word ; "\" = \n"]) ; let (bool,sequence) = parse (sequence_from_string word) in bool && (sequence = []) end ;; (* TEST parser1 *) is_recognized_by anbn_parser1 "" ;; is_recognized_by anbn_parser1 "ab" ;; is_recognized_by anbn_parser1 "aabb" ;; is_recognized_by anbn_parser1 "ba" ;; (* STEP -2- Generalization step I We reuse the principle of parser1 but, in order to generalize the previous code we do not exploit the fact that the input is a char list: We use two functions: first, tail and is_empty instead of matching on a list. *) let (is_empty: char list -> bool) = function | [] -> true | _ -> false let (first: char list -> char option) = function | [] -> None | c::_ -> Some c let (tail: char list -> char list) = function | [] -> [] | _::others -> others let rec (anbn_parser2: char list -> bool * char list) = fun sequence -> match first sequence with | None -> (true,sequence) | Some 'a' -> let (b2,sequence2) = anbn_parser2 (tail sequence) in let (b3,sequence3) = (match first sequence2 with | Some 'b' -> (true , tail sequence2) | _ -> (false, sequence2) ) in (b2 && b3,sequence3) | _ -> (true, sequence) ;; (* TEST anbn_parser2 *) is_recognized_by anbn_parser2 "" ;; is_recognized_by anbn_parser2 "ab" ;; is_recognized_by anbn_parser2 "aabb" ;; is_recognized_by anbn_parser2 "ba" ;; (* -3- Generalization of the input format We can use any kind of input sequence as long as it provides the functions: first, tail, is_empty *) module Sequence = (struct type sequence = char Stream.t type t = sequence let (first: sequence -> char option) = fun stream -> Stream.peek stream let (tail: sequence -> sequence) = fun stream -> begin Stream.junk stream ; stream end let (is_empty: sequence -> bool) = fun stream -> Stream.peek stream = None let (from_string: string -> sequence) = fun string -> Stream.of_string string let rec (to_list: sequence -> 'x list) = fun sequence -> match first sequence with | Some x -> x::(to_list (tail sequence)) | None -> [] let (from_list: 'x list -> sequence) = Stream.of_list ;; end) module Sequence = (struct type sequence = char list type t = sequence let (is_empty: sequence -> bool) = function | [] -> true | _ -> false let (first: sequence -> char option) = function | [] -> None | c::_ -> Some c let (tail: sequence -> sequence) = function | [] -> [] | _::others -> others let rec (from_string: string -> sequence) = fun str -> if str = "" then [] else str.[0] :: (from_string (String.sub str 1 ((String.length str) - 1))) end) let rec (anbn_parser3: Sequence.t -> bool * Sequence.t) = fun sequence -> match Sequence.first sequence with | None -> (true,sequence) | Some 'a' -> let (b2,sequence2) = anbn_parser3 (Sequence.tail sequence) in let (b3,sequence3) = (match Sequence.first sequence2 with | Some 'b' -> (true , Sequence.tail sequence2) | _ -> (false, sequence2) ) in (b2 && b3,sequence3) | _ -> (true, sequence) (* -5- Generalization step IV -- the parser type Whatever the grammar, a word is recognized if 1) the parser returns "true" 2) all the sequence has been scan. The first condition is specific to the grammar, whereas the second condition does not depend on the grammar. So, instead of writing condition (2) in the parser, we check it in an external function * is_recognized_by * that runs the parser on a string which is converted into sequence. The * parse * function takes as parameters: 1. a parser to run 2. a string and returns a boolean We define the type parser as: *) type parser = Sequence.t -> bool * Sequence.t let (is_recognized_by: parser -> string -> bool) = fun parse word -> begin print_string (String.concat "" [ "\n parser \"" ; word ; "\" = \n"]) ; let (bool,sequence) = parse (Sequence.from_string word) in bool && (Sequence.is_empty sequence) end ;; (* TEST anbn_parser3 *) is_recognized_by anbn_parser3 "" ;; is_recognized_by anbn_parser3 "ab" ;; is_recognized_by anbn_parser3 "aabb" ;; is_recognized_by anbn_parser3 "ba" ;; (* -4- Generalization step III -- a parser builder We do the same case study as before with the help of an auxilliary function * parse_letter * It is not a parser but a constructor of parser: it takes a letter then, it takes an sequence and return the unscan part of the sequence and a boolean that tells if the first element of the sequence is the expected letter. With the help of * parse_letter *, the code of * anbn_parser * is greatly simplified. *) let (parse_letter: string -> (Sequence.t -> bool * Sequence.t)) = fun letter -> (fun sequence -> if letter = "" then (true,sequence) else match Sequence.first sequence with | Some c -> (letter = String.make 1 c, Sequence.tail sequence) | None -> (false, sequence) ) let rec (anbn_parser4: Sequence.t -> bool * Sequence.t) = fun sequence -> match Sequence.first sequence with | None -> (true,sequence) | Some 'a' -> let (b1,sequence1) = parse_letter "a" sequence in let (b2,sequence2) = anbn_parser4 sequence1 in let (b3,sequence3) = parse_letter "b" sequence2 in (b1 && b2 && b3, sequence3) | _ -> (true,sequence) ;; (* TEST of parser4 *) is_recognized_by anbn_parser4 "" ;; is_recognized_by anbn_parser4 "ab" ;; is_recognized_by anbn_parser4 "aabb" ;; is_recognized_by anbn_parser4 "ba" ;; (* -6- Generalization step V -- using combinator of parsers The version 4 of the anbn parser runs three parsers one after the other: parse_letter "a" anbn_parser4 parse_letter "b" We would like to define a operator that is able of managing the sequential runs of two parsers. The previous sequence of three parsers could then be obtained by two applications of , as below: (parse_letter "a" anbn_parser4) parse_letter "b" The operators should take as parameters two parsers and should return a parser (in order to be used by the next operator) which is a function that takes an sequence and returns a couple sequence * boolean *) let (then_operator: parser -> parser -> parser) = fun parser1 parser2 -> (fun sequence -> let (b1,sequence1) = parser1 sequence in if not (b1) then (false,sequence) else parser2 sequence1 ) (* We introduce an infix notation for the then_operator to allow writing (p1 <.> p2) instead of (then_operator p1 p2) and we can get rid of parenthesis and write p1 <.> p2 <.> p3 instead of ((p1 <.> p2) <.> p3) *) let (<.>) p1 p2 = then_operator p1 p2 let rec (anbn_parser5: Sequence.t -> bool * Sequence.t) = fun sequence -> match Sequence.first sequence with | None -> (true,sequence) | Some 'a' -> ( (parse_letter "a") <.> anbn_parser5 <.> (parse_letter "b") ) sequence | _ -> (true,sequence) ;; (* TEST of parser5 *) is_recognized_by anbn_parser5 "" ;; is_recognized_by anbn_parser5 "ab" ;; is_recognized_by anbn_parser5 "aabb" ;; is_recognized_by anbn_parser5 "ba" ;; let rec (anbn_parser6: Sequence.t -> bool * Sequence.t) = fun sequence -> match Sequence.first sequence with | Some 'a' -> ( (parse_letter "a") <.> anbn_parser6 <.> (parse_letter "b") ) sequence | _ -> (parse_letter "") sequence ;; (* TEST of parser6 *) is_recognized_by anbn_parser6 "" ;; is_recognized_by anbn_parser6 "ab" ;; is_recognized_by anbn_parser6 "aabb" ;; is_recognized_by anbn_parser6 "ba" ;; (* -7- Generalization step VI -- using more combinators of parsers The empty word (epsilon = "") is part of the anbn langage so, the anbn parser6 still have to test if the sequence is the empty word before switching to the other possibility. In the spirit of the operator, this switch could be done as an operator combining two parsers: parse_epsilon ((parse_letter "a") <.> anbn_parser6 <.> (parse_letter "b")) *) (* BE CAREFUL WITH STREAMS: a backup is needed to cancel side-effect in streams let (else_operator: parser -> parser -> parser) = fun parser1 parser2 -> (fun sequence -> let backup = Sequence.to_list sequence in let (b1,sequence') = parser1 (Sequence.from_list backup) in if b1 then (true,sequence') else let (b2,sequence') = parser2 (Sequence.from_list backup) in if b2 then (true ,sequence') else (false, Sequence.from_list backup) ) *) let (else_operator: parser -> parser -> parser) = fun parser1 parser2 -> (fun sequence -> let (b1,sequence') = parser1 sequence in if b1 then (true,sequence') else let (b2,sequence') = parser2 sequence in if b2 then (true ,sequence') else (false,sequence) ) (* We introduce an infix notation for the then_operator to allow writing (r1 <|> p2) instead of (else_operator p1 p2) *) let (<|>) p1 p2 = else_operator p1 p2 let rec (anbn_parser: Sequence.t -> bool * Sequence.t) = fun sequence -> ( ( (parse_letter "a") <.> anbn_parser <.> (parse_letter "b") ) <|> (parse_letter "") ) sequence ;; (* TEST of parser *) is_recognized_by anbn_parser "" ;; is_recognized_by anbn_parser "ab" ;; is_recognized_by anbn_parser "aabb" ;; is_recognized_by anbn_parser "ba" ;; type parser = Sequence.t -> bool * Sequence.t type 't extractor = Sequence.t -> bool * 't * Sequence.t let (extend_parser_with: parser -> 'result -> 'result extractor) = fun parser result -> (fun sequence -> let (b,sequence') = parser sequence in (b,result,sequence') ) let (-->) parser result = extend_parser_with parser result ;; let (apply_on_extractor: 'r extractor -> ('r -> 'r ) -> 'r extractor) = fun extractor f -> (fun sequence -> let (b,r,sequence') = extractor sequence in (b,f r,sequence') ) let (==>) extractor f = apply_on_extractor extractor f ;; let (then_operator: ('r list) extractor -> ('r list) extractor -> ('r list) extractor) = fun extractor1 extractor2 -> (fun sequence -> let (b1,rs1,sequence1) = extractor1 sequence in if not (b1) then (false,[],sequence) else let (b2,rs2,sequence2) = extractor2 sequence1 in (b1 && b2, rs1 @ rs2, sequence2) ) let (<.>) ex1 ex2 = then_operator ex1 ex2 ;; let (else_operator: ('r list) extractor -> ('r list) extractor -> ('r list) extractor) = fun extractor1 extractor2 -> (fun sequence -> let (b1,rs1,sequence') = extractor1 sequence in if b1 then (true,rs1,sequence') else let (b2,rs2,sequence') = extractor2 sequence in if b2 then (true , rs2, sequence') else (false, [] , sequence) ) let (<|>) ex1 ex2 = else_operator ex1 ex2 ;; let (sum: int list -> int list) = fun ns -> [ List.fold_left (+) 0 ns ] ;; let rec (anbn_extractor: Sequence.t -> bool * int list * Sequence.t) = fun sequence -> ( ( ( ((parse_letter "a") --> [1]) <.> anbn_extractor <.> ((parse_letter "b") --> [0]) ) ==> sum) <|> ((parse_letter "") --> [0]) ) sequence ;; let (run: 'result extractor -> string -> bool * 'result) = fun extract word -> begin print_string (String.concat "" [ "\n extractor \"" ; word ; "\" = \n"]) ; let (bool,rs,sequence) = extract (Sequence.from_string word) in (bool && (Sequence.is_empty sequence), rs) end ;; (* TEST of parser *) run anbn_extractor "" ;; run anbn_extractor "ab" ;; run anbn_extractor "aabb" ;; run anbn_extractor "ba" ;; (* Derivation tree *) (* Ambigous grammar *) (* -8- Generalization step VII -- curryfication Thanks to curryfication: We could get rid of the argument * sequence * and just write: let rec (anbn_parser: Sequence.t -> bool * Sequence.t) = ( (parse_letter "a") <.> anbn_parser <.> (parse_letter "b") ) <|> (parse_letter "") ;; *)