mercury_term_lexer (The Mercury Library Reference Manual)

%--------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%--------------------------------------------------%
% Copyright (C) 1993-2000, 2003-2008, 2011-2012 The University of Melbourne.
% Copyright (C) 2014-2025 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%--------------------------------------------------%
%
% File: mercury_term_lexer.m.
% Main author: fjh.
% Stability: high.
%
% This module does lexical analysis of Mercury code. Its predicates analyze
% character sequences, and return the token sequences they contain, up to
% and including the token that ens a term, the period character.
% (If the input does not conform to Mercury's rules, then some of the
% returned tokens may be error indications.)
%
% This module exports predicates that do this lexical analysis both
% on characters read in from a stream, and on characters in a string
% (which may or may not represent the contents of a file).
%
%--------------------------------------------------%
%--------------------------------------------------%

:- module mercury_term_lexer.
:- interface.

:- import_module char.
:- import_module integer.
:- import_module io.

%--------------------------------------------------%

    % This is the type of the tokens that can appear in token lists.
    %
    % The "=< raw_token" is there because the actual implementation
    % of the lexer uses some internal-use-only kinds of tokens
    % that never appear in its output.
:- type token =< raw_token
    --->    name(string)
    ;       variable(string)
    ;       integer(integer_base, integer, signedness, integer_size)
    ;       float(float)
    ;       string(string)      % "...."
    ;       implementation_defined(string) % $name
    ;       open                % '('
    ;       open_ct             % '(' without any preceding whitespace
    ;       close               % ')'
    ;       open_list           % '['
    ;       close_list          % ']'
    ;       open_curly          % '{'
    ;       close_curly         % '}'
    ;       ht_sep              % '|' head-tail separator in lists
    ;       comma               % ','
    ;       end                 % '.'
    ;       junk(char)          % junk character in the input stream
    ;       error(string)       % some other invalid token
    ;       io_error(io.error). % error reading from the input stream

:- type integer_base
    --->    base_2
    ;       base_8
    ;       base_10
    ;       base_16.

:- type signedness
    --->    signed
    ;       unsigned.

:- type integer_size
    --->    size_word
    ;       size_8_bit
    ;       size_16_bit
    ;       size_32_bit
    ;       size_64_bit.

    % For every token, we record the line number of the line
    % on which the token occurred.
    %
:- type token_context == int.   % line number

    % This "fat list" representation is more efficient than a list of pairs.
    %
:- type token_list
    --->    token_cons(token, token_context, token_list)
    ;       token_nil.

    % A line_context and a line_posn together contain exactly the same
    % fields as a posn (which is defined in io.m), with the same semantics.
    % The difference is that stepping past a single character requires
    % no memory allocation whatsoever *unless* that character is a newline.
    %
    % XXX We should consider making both fields of line_context into uint32s,
    % to allow them to fit into a single 64 bit word. Simplicity would then
    % require line_posn's argument being a uint32 as well.

:- type line_context
    --->    line_context(
                line_context_current_line_number        :: int,
                line_context_offset_of_start_of_line    :: int
            ).

:- type line_posn
    --->    line_posn(
                line_posn_current_offset_in_file        :: int
            ).

    % Read a list of tokens either from the current input stream
    % or from the specified input stream.
    % Keep reading until we encounter either an `end' token
    % (i.e. a full stop followed by whitespace) or the end-of-file.
    %
    % See char.is_whitespace for the definition of whitespace characters
    % used by this predicate.
    %
:- pred get_token_list(token_list::out, io::di, io::uo) is det.
:- pred get_token_list(io.text_input_stream::in, token_list::out,
    io::di, io::uo) is det.

    % The type `offset' represents a (zero-based) offset into a string.
    %
:- type offset == int.

    % string_get_token_list_max(String, MaxOffset, Tokens,
    %   InitialPos, FinalPos):
    % linestr_get_token_list_max(String, MaxOffset, Tokens,
    %   InitialLineContext, FinalLineContext, InitialPos, FinalPos):
    %
    % Scan a list of tokens from a string, starting at the current offset
    % specified by InitialPos. Keep scanning until either we encounter either
    % an `end' token (i.e. a full stop followed by whitespace) or until we
    % reach MaxOffset. (MaxOffset must be =< the length of the string.)
    % Return the tokens scanned in Tokens, and return the position one
    % character past the end of the last token in FinalPos.
    %
    % See char.is_whitespace for the definition of whitespace characters
    % used by this predicate.
    %
:- pred string_get_token_list_max(string::in, offset::in, token_list::out,
    posn::in, posn::out) is det.
:- pred linestr_get_token_list_max(string::in, offset::in, token_list::out,
    line_context::in, line_context::out, line_posn::in, line_posn::out) is det.

    % string_get_token_list(String, Tokens, InitialPos, FinalPos):
    %
    % calls string_get_token_list_max above with MaxPos = length of String.
    %
:- pred string_get_token_list(string::in, token_list::out,
    posn::in, posn::out) is det.

    % Convert a token to a human-readable string describing the token.
    %
:- pred token_to_string(token::in, string::out) is det.

%--------------------------------------------------%
53 mercury_term_lexer