%--------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%--------------------------------------------------%
% Copyright (C) 2009-2012 The University of Melbourne.
% Copyright (C) 2014-2020, 2022, 2025 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%--------------------------------------------------%
%
% File: parsing_utils.m.
% Authors: rafe, maclarty.
% Stability: medium.
%
% Utilities for recursive descent parsers. Parsers take at least three
% arguments: a source (src) containing the input string, and an input/output
% pair of parser states (ps) tracking the current offset into the input.
%
% Call parse(InputString, SkipWS, Parser, Result) to parse an input string
% and return an error context and message if parsing failed.
% The SkipWS predicate is used by the primitive parsers to skip over any
% following whitespace (providing a skipping predicate allows users to define
% comments as whitespace).
% Alternatively, a new src and ps can be constructed by calling
% new_src_and_ps(InputString, SkipWS, Src, !:PS).
%
% Parsing predicates are semidet and typically take the form
% p(...parameters..., Src, Result, !PS). A parser matching variable
% assignments of the form `x = 42' might be defined like this:
%
% var_assignment(Src, {Var, Value}, !PS) :-
% var(Src, Var, !PS),
% punct("=", Src, _, !PS),
% expr(Src, Value, !PS).
%
% where var/4 and expr/4 are parsers for variables and expressions respectively
% and punct/5 is provided by this module for matching punctuation.
%
%--------------------------------------------------%
%--------------------------------------------------%
:- module parsing_utils.
:- interface.
:- import_module char.
:- import_module list.
:- import_module maybe.
:- import_module unit.
%--------------------------------------------------%
% The parser source (input string).
% This is an abstract type: its representation is not exported here.
% A src is obtained from new_src_and_ps and is passed, as an input
% argument, to every parser declared in this interface.
%
:- type src.
% The parser "state", passed around in DCG arguments.
% This is an abstract type. Parsers take it as an input/output pair
% (written !PS in the examples in this file); the current offset into
% the input can be read from it with current_offset.
%
:- type ps.
% These types and insts are useful for specifying "standard" parser
% signatures.
%
% A parser(T) takes the source and an initial parser state, and returns
% a result of type T together with the updated parser state.
% The parser inst gives the matching modes (in, out, in, out) and says
% that such a predicate is semidet, i.e. it either succeeds with a result
% or fails.
%
:- type parser(T) == pred(src, T, ps, ps).
:- inst parser == (pred(in, out, in, out) is semidet).
% The following are for parsers that also transform a separate state value.
%
% A parser_with_state(T, S) is like a parser(T), but additionally takes
% an input/output pair of values of the user-chosen type S, placed between
% the result and the parser state arguments.
%
:- type parser_with_state(T, S) == pred(src, T, S, S, ps, ps).
:- inst parser_with_state == (pred(in, out, in, out, in, out) is semidet).
% Predicates of this type are used to skip whitespace in the primitive
% parsers provided by this module.
% Since this is just parser(unit), such predicates are passed with the
% parser inst (see parse/4 and new_src_and_ps/4).
%
:- type skip_whitespace_pred == parser(unit).
% The result of parse/3 and parse/4.
%
% ok(Value) carries the value returned by the parser.
% error(MaybeMessage, Line, Col) describes a failed parse: MaybeMessage is
% the message recorded by fail_with_message when one applies (see parse/4),
% and Line and Col locate the error in the input.
% NOTE(review): Line and Col are presumably 1-based, as for
% offset_to_line_number_and_position -- confirm against the implementation.
%
:- type parse_result(T)
---> ok(T)
; error(
error_message :: maybe(string),
error_line :: int,
error_col :: int
).
% parse(Input, SkipWS, Parser, Result):
%
% Try to parse Input using Parser and SkipWS to consume whitespace.
% If Parser succeeds then Result is ok with the parsed value,
% otherwise Result is an error.
%
% If there were any calls to fail_with_message without any subsequent
% progress being made, then the error result contains the message passed
% to the last such call. Otherwise the error result contains no message,
% and reports the furthest position the parser reached in the input
% string.
%
:- pred parse(string::in, skip_whitespace_pred::in(parser),
parser(T)::in(parser), parse_result(T)::out) is cc_multi.
% parse(Input, Parser, Result):
%
% As parse/4, but using the default whitespace parser, whitespace/4,
% in place of a user-supplied SkipWS predicate.
%
:- pred parse(string::in, parser(T)::in(parser), parse_result(T)::out)
is cc_multi.
%--------------------------------------------------%
% new_src_and_ps(InputString, SkipWS, Src, PS):
%
% Construct a new parser source and state from a string, also specifying
% a predicate for skipping over whitespace. Several primitive parsers
% use this predicate to consume whitespace after a token; this argument
% allows the user to specify a predicate for, say, skipping over comments
% as well.
%
:- pred new_src_and_ps(string::in, skip_whitespace_pred::in(parser),
src::out, ps::out) is det.
% new_src_and_ps(InputString, Src, PS):
%
% Construct a new parser source and state from a string.
% The default whitespace parser, whitespace/4, is used.
%
:- pred new_src_and_ps(string::in, src::out, ps::out) is det.
%--------------------------------------------------%
% input_string(Src, InputString, Length):
%
% Return the input string and its length from the parser source.
% NOTE(review): the unit in which the length is measured is not stated
% here -- confirm against the implementation.
%
:- pred input_string(src::in, string::out, int::out) is det.
% current_offset(Src, Offset, !PS):
%
% Obtain the current offset from the start of the input string
% (the first character in the input has offset 0).
% The argument order matches the parser type, with Offset as the result.
%
:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.
% get_skip_whitespace_pred(Src, SkipWS):
%
% Return the whitespace-skipping parser associated with the parser source
% (see new_src_and_ps/4 for how one is supplied).
%
:- pred get_skip_whitespace_pred(src::in, skip_whitespace_pred::out(parser))
is det.
%--------------------------------------------------%
% input_substring(Src, StartOffset, EndOffsetPlusOne, Substring):
%
% Copy the substring from the input occupying the offsets
% [StartOffset, EndOffsetPlusOne), i.e. the end offset is exclusive.
% NOTE(review): this is semidet; presumably it fails when the offsets
% do not denote a valid range of the input -- confirm.
%
:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
%--------------------------------------------------%
% An abstract structure, built by src_to_line_numbers, that is used by
% offset_to_line_number_and_position to map offsets to line numbers
% and positions.
%
:- type line_numbers.
% src_to_line_numbers(Src) = LineNumbers:
%
% Compute a structure from the parser source which can be used
% to convert offsets into line numbers and positions in the file
% (this is useful for error reporting). The conversion itself is done by
% offset_to_line_number_and_position.
%
:- func src_to_line_numbers(src) = line_numbers.
% offset_to_line_number_and_position(LineNumbers, Offset, LineNo, Pos):
%
% Convert an offset into a line number and position within the line
% (the first line is number 1; the first character in a line is
% position 1). LineNumbers is the structure computed by
% src_to_line_numbers.
%
:- pred offset_to_line_number_and_position(line_numbers::in, int::in,
int::out, int::out) is det.
%--------------------------------------------------%
% next_char(Src, Char, !PS):
%
% Read the next char. Unlike next_char_no_progress, this records
% progress information, which is used for reporting parse errors.
% NOTE(review): presumably fails at the end of the input -- confirm.
%
:- pred next_char(src::in, char::out, ps::in, ps::out) is semidet.
% next_char_no_progress(Src, Char, !PS):
%
% Read the next char but do not record progress information.
% This is more efficient than next_char, but may produce less informative
% error messages in case of a parse error.
%
:- pred next_char_no_progress(src::in, char::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%
% char_in_class(CharClass, Src, Char, !PS):
%
% Match a char from the given string: the string CharClass lists the
% acceptable chars, and Char is the char that was matched.
%
:- pred char_in_class(string::in, src::in, char::out,
ps::in, ps::out) is semidet.
%--------------------------------------------------%
% punct(Punct, Src, _, !PS):
%
% Match the string Punct exactly and any subsequent whitespace.
% The result argument is of type unit and carries no information.
%
:- pred punct(string::in, src::in, unit::out, ps::in, ps::out) is semidet.
% keyword(IdChars, Keyword, Src, _, !PS):
%
% Match Keyword exactly (i.e., it must not be followed by any character
% in IdChars) and any subsequent whitespace.
% IdChars is a string listing the chars that must not immediately follow
% the keyword.
%
:- pred keyword(string::in, string::in, src::in, unit::out,
ps::in, ps::out) is semidet.
% ikeyword(IdChars, Keyword, Src, _, !PS):
%
% Case-insensitive version of keyword/6.
% Only uppercase and lowercase letters in the ASCII range (A-Z, a-z)
% are compared case insensitively; all other characters must match
% exactly.
%
:- pred ikeyword(string::in, string::in, src::in, unit::out,
ps::in, ps::out) is semidet.
% identifier(InitIdChars, IdChars, Src, Identifier, !PS):
%
% Match the next identifier (result in Identifier) comprising a char from
% InitIdChars followed by zero or more chars from IdChars. Any subsequent
% whitespace is consumed.
% InitIdChars and IdChars are strings listing the permitted chars.
%
:- pred identifier(string::in, string::in, src::in, string::out,
ps::in, ps::out) is semidet.
% whitespace(Src, _, !PS):
%
% Consume any whitespace (defined as a sequence of characters
% satisfying char.is_whitespace).
% This is the default whitespace parser used by parse/3 and
% new_src_and_ps/3.
%
:- pred whitespace(src::in, unit::out,
ps::in, ps::out) is semidet.
%--------------------------------------------------%
% skip_to_eol(Src, _, !PS):
%
% Consume any input up to, and including, the next newline character
% marking the end of the current line.
%
:- pred skip_to_eol(src::in, unit::out,
ps::in, ps::out) is semidet.
% eof(Src, _, !PS):
%
% Succeed if we have reached the end of the input; fail otherwise.
%
:- pred eof(src::in, unit::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%
% float_literal_as_string(Src, String, !PS):
% float_literal(Src, Float, !PS):
%
% Parse a float literal matching [-][0-9]+[.][0-9]+([Ee][-+][0-9]+)?
% followed by any whitespace. The float_literal_as_string version simply
% returns the matched string. The float_literal version uses
% string.to_float to convert the output of float_literal_as_string; this
% may return an approximate answer since not all floating point numbers
% can be perfectly represented as Mercury floats.
%
% NOTE(review): read literally, the pattern above requires a leading `-'
% and a sign in the exponent; presumably both are meant to be optional
% -- confirm against the implementation.
%
:- pred float_literal_as_string(src::in, string::out,
ps::in, ps::out) is semidet.
:- pred float_literal(src::in, float::out,
ps::in, ps::out) is semidet.
% int_literal_as_string(Src, String, !PS):
% int_literal(Src, Int, !PS):
%
% Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+,
% followed by any whitespace. The int_literal_as_string version simply
% returns the matched string. The int_literal version uses string.to_int
% to convert the output of int_literal_as_string; this may fail if the
% number in question cannot be represented as a Mercury int.
%
% NOTE(review): read literally, the pattern above requires a leading `-';
% presumably the sign is meant to be optional -- confirm against the
% implementation.
%
:- pred int_literal_as_string(src::in, string::out,
ps::in, ps::out) is semidet.
:- pred int_literal(src::in, int::out,
ps::in, ps::out) is semidet.
% string_literal(QuoteChar, Src, String, !PS):
%
% Parse a string literal. The first argument (a char) is the quote
% character. A backslash (\) character in the string makes the next
% character literal (e.g., for embedding quotes). These 'escaped'
% characters are included as-is in the result, along with the preceding
% backslash. Any following whitespace is also consumed.
%
:- pred string_literal(char::in, src::in, string::out,
ps::in, ps::out) is semidet.
%--------------------------------------------------%
% Each basic parser combinator has a version that has a separate state
% argument threaded through the computation, for parsers that e.g.
% incrementally construct a symbol table.
% optional(P, Src, Result, !PS):
%
% Returns Result = yes(X) if P(Src, X, !PS) succeeds,
% or Result = no if P does not succeed.
%
:- pred optional(parser(T)::in(parser), src::in, maybe(T)::out,
ps::in, ps::out) is semidet.
% optional(P, Src, Result, !S, !PS):
%
% Returns Result = yes(X) if P(Src, X, !S, !PS) succeeds,
% or Result = no if P does not succeed.
% This is the version of optional/5 for parsers that also take
% a separate state value.
%
:- pred optional(parser_with_state(T, S)::in(parser_with_state), src::in,
maybe(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%
% zero_or_more(P, Src, Xs, !PS):
%
% Returns the list of results Xs obtained by repeatedly applying P
% until P fails. The nth item in Xs is the result from the nth
% application of P.
%
:- pred zero_or_more(parser(T)::in(parser), src::in, list(T)::out,
ps::in, ps::out) is semidet.
% zero_or_more(P, Src, Xs, !S, !PS):
%
% Returns the list of results Xs obtained by repeatedly applying P
% until P fails. The nth item in Xs is the result from the nth
% application of P.
% This is the version of zero_or_more/5 for parsers that also take
% a separate state value.
%
:- pred zero_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%
% one_or_more(P, Src, Xs, !PS):
%
% Returns the list of results Xs obtained by repeatedly applying P
% until P fails. The nth item in Xs is the result from the nth
% application of P. P must succeed at least once.
%
:- pred one_or_more(parser(T)::in(parser), src::in, list(T)::out,
ps::in, ps::out) is semidet.
% one_or_more(P, Src, Xs, !S, !PS):
%
% Returns the list of results Xs obtained by repeatedly applying P
% until P fails. The nth item in Xs is the result from the nth
% application of P. P must succeed at least once.
% This is the version of one_or_more/5 for parsers that also take
% a separate state value.
%
:- pred one_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%
% brackets(L, R, P, Src, X, !PS):
%
% Equivalent to
% punct(L, Src, _, !PS), P(Src, X, !PS), punct(R, Src, _, !PS).
% That is, L and R are the opening and closing punctuation strings
% and X is the result of the enclosed parser P.
%
:- pred brackets(string::in, string::in, parser(T)::in(parser), src::in,
T::out, ps::in, ps::out) is semidet.
% brackets(L, R, P, Src, X, !S, !PS):
%
% Equivalent to
% punct(L, Src, _, !PS), P(Src, X, !S, !PS), punct(R, Src, _, !PS).
% This is the version of brackets/7 for parsers that also take
% a separate state value.
%
:- pred brackets(string::in, string::in,
parser_with_state(T, S)::in(parser_with_state), src::in,
T::out, S::in, S::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%
% separated_list(Separator, P, Src, Xs, !PS):
%
% Like zero_or_more(P, Src, Xs, !PS) except that successive applications
% of P must be separated by punct(Separator, Src, _, !PS).
%
:- pred separated_list(string::in, parser(T)::in(parser), src::in,
list(T)::out, ps::in, ps::out) is semidet.
% separated_list(Separator, P, Src, Xs, !S, !PS):
%
% Like zero_or_more(P, Src, Xs, !S, !PS) except that successive
% applications of P must be separated by punct(Separator, Src, _, !PS).
%
:- pred separated_list(string::in,
parser_with_state(T, S)::in(parser_with_state),
src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%
% comma_separated_list(P, Src, Xs, !PS) is the same as
% separated_list(",", P, Src, Xs, !PS).
%
:- pred comma_separated_list(parser(T)::in(parser), src::in, list(T)::out,
ps::in, ps::out) is semidet.
% comma_separated_list(P, Src, Xs, !S, !PS):
%
% The same as separated_list(",", P, Src, Xs, !S, !PS), i.e. the
% separator between successive applications of P is a comma.
%
:- pred comma_separated_list(parser_with_state(T, S)::in(parser_with_state),
src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
%--------------------------------------------------%
% fail_with_message(Message, Src, _, !PS):
%
% Declaratively this predicate is equivalent to false. Operationally,
% it records the error message Message and then fails. The recorded
% message is returned by parse/4 if no further progress is made.
%
:- pred fail_with_message(string::in, src::in, T::out, ps::in, ps::out)
is semidet.
% fail_with_message(Message, Offset, Src, _, !PS):
%
% As fail_with_message/5, but use the given offset for the context
% of the message.
%
:- pred fail_with_message(string::in, int::in, src::in, T::out,
ps::in, ps::out) is semidet.
%--------------------------------------------------%
%--------------------------------------------------%