%--------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%--------------------------------------------------%
% Copyright (C) 2009-2012 The University of Melbourne.
% Copyright (C) 2014-2020, 2022 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%--------------------------------------------------%
%
% File: parsing_utils.m
% Authors: Ralph Becket <rafe@csse.unimelb.edu.au>, maclarty
% Stability: low
%
% Utilities for recursive descent parsers. Parsers take at least three
% arguments: a source (src) containing the input string, and an input/output
% pair of parser states (ps) tracking the current offset into the input.
%
% Call parse(InputString, SkipWS, Parser, Result) to parse an input string
% and return an error context and message if parsing failed.
% The SkipWS predicate is used by the primitive parsers to skip over any
% following whitespace (providing a skipping predicate allows users to define
% comments as whitespace).
% Alternatively, a new src and ps can be constructed by calling
% new_src_and_ps(InputString, SkipWS, Src, !:PS).
%
% Parsing predicates are semidet and typically take the form
% p(...parameters..., Src, Result, !PS). A parser matching variable
% assignments of the form `x = 42' might be defined like this:
%
%   var_assignment(Src, {Var, Value}, !PS) :-
%       var(Src, Var, !PS),
%       punct("=", Src, _, !PS),
%       expr(Src, Value, !PS).
%
% where var/4 and expr/4 are parsers for variables and expressions respectively
% and punct/5 is provided by this module for matching punctuation.
%
%--------------------------------------------------%
%--------------------------------------------------%

:- module parsing_utils.
:- interface.

:- import_module char.
:- import_module list.
:- import_module maybe.
:- import_module unit.

%--------------------------------------------------%

    % The parser source (input string).
    %
:- type src.

    % The parser "state", passed around in DCG arguments.
    %
:- type ps.
%--------------------------------------------------%

    % These types and insts are useful for specifying "standard" parser
    % signatures.
    %
:- type parser(T) == pred(src, T, ps, ps).
:- inst parser == (pred(in, out, in, out) is semidet).

    % The following are for parsers that also transform a separate state value.
    %
:- type parser_with_state(T, S) == pred(src, T, S, S, ps, ps).
:- inst parser_with_state == (pred(in, out, in, out, in, out) is semidet).

    % Predicates of this type are used to skip whitespace in the primitive
    % parsers provided by this module.
    %
:- type skip_whitespace_pred == parser(unit).

    % The result of parse/3 and parse/4: ok(Value) if the parser succeeded,
    % otherwise error with an optional message (see parse/4 for when a
    % message is present) and the line and column of the error context.
    % NOTE(review): line and column are presumably numbered from 1, as in
    % offset_to_line_number_and_position/4 -- confirm against the
    % implementation.
    %
:- type parse_result(T)
    --->    ok(T)
    ;       error(
                error_message   :: maybe(string),
                error_line      :: int,
                error_col       :: int
            ).

    % parse(Input, SkipWS, Parser, Result).
    % Try to parse Input using Parser and SkipWS to consume whitespace.
    % If Parser succeeds then return ok with the parsed value,
    % otherwise return error. If there were any calls to fail_with_message
    % without any subsequent progress being made, then the error message
    % passed to the last call to fail_with_message will be returned in the
    % error result. Otherwise no message is returned and the furthest
    % position the parser got in the input string is returned.
    %
:- pred parse(string::in, skip_whitespace_pred::in(parser),
    parser(T)::in(parser), parse_result(T)::out) is cc_multi.

    % As above but using the default whitespace parser, whitespace/4.
    %
:- pred parse(string::in, parser(T)::in(parser), parse_result(T)::out)
    is cc_multi.

%--------------------------------------------------%

    % Construct a new parser source and state from a string, also specifying
    % a predicate for skipping over whitespace (several primitive parsers
    % use this predicate to consume whitespace after a token; this argument
    % allows the user to specify a predicate for, say, skipping over comments
    % as well).
    %
:- pred new_src_and_ps(string::in, skip_whitespace_pred::in(parser),
    src::out, ps::out) is det.

    % Construct a new parser source and state from a string.
    % The default whitespace parser, whitespace/4, is used.
    %
:- pred new_src_and_ps(string::in, src::out, ps::out) is det.

%--------------------------------------------------%

    % Return the input string and its length from the parser source.
    %
:- pred input_string(src::in, string::out, int::out) is det.

    % Obtain the current offset from the start of the input string
    % (the first character in the input has offset 0).
    %
:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.

    % Return the parser to skip over whitespace from the parser source.
    %
:- pred get_skip_whitespace_pred(src::in, skip_whitespace_pred::out(parser))
    is det.

%--------------------------------------------------%

    % input_substring(Src, StartOffset, EndOffsetPlusOne, Substring):
    % Copy the substring from the input occupying the offsets
    % [StartOffset, EndOffsetPlusOne).
    %
:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.

%--------------------------------------------------%

:- type line_numbers.

    % Compute a structure from the parser source which can be used to
    % convert offsets into line numbers and positions in the file (this
    % is useful for error reporting).
    %
:- func src_to_line_numbers(src) = line_numbers.

    % Convert an offset into a line number and position within the line
    % (the first line is number 1; the first character in a line is
    % position 1).
    %
:- pred offset_to_line_number_and_position(line_numbers::in, int::in,
    int::out, int::out) is det.

%--------------------------------------------------%

    % Read the next char.
    %
:- pred next_char(src::in, char::out, ps::in, ps::out) is semidet.

    % Read the next char but do not record progress information.
    % This is more efficient than next_char, but may produce less informative
    % error messages in case of a parse error.
    %
:- pred next_char_no_progress(src::in, char::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % Match a char from the given string.
    %
:- pred char_in_class(string::in, src::in, char::out, ps::in, ps::out)
    is semidet.

%--------------------------------------------------%

    % Match a string exactly and any subsequent whitespace.
    %
:- pred punct(string::in, src::in, unit::out, ps::in, ps::out) is semidet.

    % keyword(IdChars, Keyword, Src, _, !PS) matches Keyword exactly (i.e., it
    % must not be followed by any character in IdChars) and any subsequent
    % whitespace.
    %
:- pred keyword(string::in, string::in, src::in, unit::out, ps::in, ps::out)
    is semidet.

    % ikeyword(IdChars, Keyword, Src, _, !PS)
    % Case-insensitive version of keyword/6.
    % Only uppercase and lowercase letters in the ASCII range (A-Z, a-z)
    % are compared case insensitively.
    %
:- pred ikeyword(string::in, string::in, src::in, unit::out, ps::in, ps::out)
    is semidet.

    % identifier(InitIdChars, IdChars, Src, Identifier, !PS) matches the next
    % identifier (result in Identifier) comprising a char from InitIdChars
    % followed by zero or more chars from IdChars. Any subsequent whitespace
    % is consumed.
    %
:- pred identifier(string::in, string::in, src::in, string::out,
    ps::in, ps::out) is semidet.

    % Consume any whitespace (defined as a sequence of characters
    % satisfying char.is_whitespace).
    %
:- pred whitespace(src::in, unit::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % Consume any input up to, and including, the next newline character
    % marking the end of the current line.
    %
:- pred skip_to_eol(src::in, unit::out, ps::in, ps::out) is semidet.

    % Succeed if we have reached the end of the input.
    %
:- pred eof(src::in, unit::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % Parse a float literal matching [-][0-9]+[.][0-9]+([Ee][-+][0-9]+)?
    % followed by any whitespace. The float_literal_as_string version simply
    % returns the matched string. The float_literal version uses
    % string.to_float to convert the output of float_literal_as_string; this
    % may return an approximate answer since not all floating point numbers
    % can be perfectly represented as Mercury floats.
    %
:- pred float_literal_as_string(src::in, string::out, ps::in, ps::out)
    is semidet.
:- pred float_literal(src::in, float::out, ps::in, ps::out) is semidet.

    % Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+,
    % followed by any whitespace. The int_literal_as_string version simply
    % returns the matched string. The int_literal version uses string.to_int
    % to convert the output of int_literal_as_string; this may fail if the
    % number in question cannot be represented as a Mercury int.
    %
:- pred int_literal_as_string(src::in, string::out, ps::in, ps::out)
    is semidet.
:- pred int_literal(src::in, int::out, ps::in, ps::out) is semidet.

    % Parse a string literal. The char argument is the quote character.
    % A backslash (\) character in the string makes the next character
    % literal (e.g., for embedding quotes). These 'escaped' characters
    % are included as-is in the result, along with the preceding backslash.
    % Any following whitespace is also consumed.
    %
:- pred string_literal(char::in, src::in, string::out, ps::in, ps::out)
    is semidet.

%--------------------------------------------------%

% Each basic parser combinator has a version in which a separate state
% argument is threaded through the computation, for parsers that e.g.
% incrementally construct a symbol table.

    % optional(P, Src, Result, !PS) returns Result = yes(X) if P(Src, X, !PS),
    % or Result = no if P does not succeed.
    %
:- pred optional(parser(T)::in(parser), src::in, maybe(T)::out,
    ps::in, ps::out) is semidet.

    % optional(P, Src, Result, !S, !PS) returns Result = yes(X)
    % if P(Src, X, !S, !PS), or Result = no if P does not succeed.
    %
:- pred optional(parser_with_state(T, S)::in(parser_with_state), src::in,
    maybe(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%

    % zero_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
    % by repeatedly applying P until P fails. The nth item in Xs is
    % the result from the nth application of P.
    %
:- pred zero_or_more(parser(T)::in(parser), src::in, list(T)::out,
    ps::in, ps::out) is semidet.

    % zero_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
    % by repeatedly applying P until P fails. The nth item in Xs is
    % the result from the nth application of P. The separate state !S is
    % passed to P along with the parser state.
    %
:- pred zero_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
    list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % one_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
    % by repeatedly applying P until P fails. The nth item in Xs is
    % the result from the nth application of P. P must succeed at least once.
    %
:- pred one_or_more(parser(T)::in(parser), src::in, list(T)::out,
    ps::in, ps::out) is semidet.

    % one_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
    % by repeatedly applying P until P fails. The nth item in Xs is
    % the result from the nth application of P. P must succeed at least once.
    % The separate state !S is passed to P along with the parser state.
    %
:- pred one_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
    list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % brackets(L, R, P, Src, X, !PS) is equivalent to
    %   punct(L, Src, _, !PS), P(Src, X, !PS), punct(R, Src, _, !PS).
    %
:- pred brackets(string::in, string::in, parser(T)::in(parser), src::in,
    T::out, ps::in, ps::out) is semidet.

    % brackets(L, R, P, Src, X, !S, !PS) is equivalent to
    %   punct(L, Src, _, !PS), P(Src, X, !S, !PS), punct(R, Src, _, !PS).
    %
:- pred brackets(string::in, string::in,
    parser_with_state(T, S)::in(parser_with_state), src::in,
    T::out, S::in, S::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%

    % separated_list(Separator, P, Src, Xs, !PS) is like
    % zero_or_more(P, Src, Xs, !PS) except that successive applications of
    % P must be separated by punct(Separator, Src, _, !PS).
    %
:- pred separated_list(string::in, parser(T)::in(parser), src::in,
    list(T)::out, ps::in, ps::out) is semidet.

    % separated_list(Separator, P, Src, Xs, !S, !PS) is like
    % zero_or_more(P, Src, Xs, !S, !PS) except that successive applications of
    % P must be separated by punct(Separator, Src, _, !PS).
    %
:- pred separated_list(string::in,
    parser_with_state(T, S)::in(parser_with_state), src::in,
    list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % comma_separated_list(P, Src, Xs, !PS) is the same as
    %   separated_list(",", P, Src, Xs, !PS).
    %
:- pred comma_separated_list(parser(T)::in(parser), src::in, list(T)::out,
    ps::in, ps::out) is semidet.

    % comma_separated_list(P, Src, Xs, !S, !PS) is the same as
    %   separated_list(",", P, Src, Xs, !S, !PS).
    %
:- pred comma_separated_list(
    parser_with_state(T, S)::in(parser_with_state), src::in,
    list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % Declaratively this predicate is equivalent to false. Operationally,
    % it will record an error message that will be returned by parse/4
    % if no further progress is made and then fail.
    %
:- pred fail_with_message(string::in, src::in, T::out, ps::in, ps::out)
    is semidet.

    % As above, but use the given offset for the context of the message.
    %
:- pred fail_with_message(string::in, int::in, src::in, T::out,
    ps::in, ps::out) is semidet.

%--------------------------------------------------%
%--------------------------------------------------%