Next: mercury_term_parser, Previous: maybe, Up: Top [Contents]
%--------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%--------------------------------------------------%
% Copyright (C) 1993-2000, 2003-2008, 2011-2012 The University of Melbourne.
% Copyright (C) 2014-2025 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%--------------------------------------------------%
%
% File: mercury_term_lexer.m.
% Main author: fjh.
% Stability: high.
%
% This module does lexical analysis of Mercury code. Its predicates analyze
% character sequences, and return the token sequences they contain, up to
% and including the token that ens a term, the period character.
% (If the input does not conform to Mercury's rules, then some of the
% returned tokens may be error indications.)
%
% This module exports predicates that do this lexical analysis both
% on characters read in from a stream, and on characters in a string
% (which may or may not represent the contents of a file).
%
%--------------------------------------------------%
%--------------------------------------------------%
:- module mercury_term_lexer.
:- interface.
:- import_module char.
:- import_module integer.
:- import_module io.
%--------------------------------------------------%
% This is the type of the tokens that can appear in token lists.
%
% The "=< raw_token" is there because the actual implementation
% of the lexer uses some internal-use-only kinds of tokens
% that never appear in its output.
:- type token =< raw_token
---> name(string)
; variable(string)
; integer(integer_base, integer, signedness, integer_size)
; float(float)
; string(string) % "...."
; implementation_defined(string) % $name
; open % '('
; open_ct % '(' without any preceding whitespace
; close % ')'
; open_list % '['
; close_list % ']'
; open_curly % '{'
; close_curly % '}'
; ht_sep % '|' head-tail separator in lists
; comma % ','
; end % '.'
; junk(char) % junk character in the input stream
; error(string) % some other invalid token
; io_error(io.error). % error reading from the input stream
:- type integer_base
---> base_2
; base_8
; base_10
; base_16.
:- type signedness
---> signed
; unsigned.
:- type integer_size
---> size_word
; size_8_bit
; size_16_bit
; size_32_bit
; size_64_bit.
% For every token, we record the line number of the line
% on which the token occurred.
%
:- type token_context == int. % line number
% This "fat list" representation is more efficient than a list of pairs.
%
:- type token_list
---> token_cons(token, token_context, token_list)
; token_nil.
% A line_context and a line_posn together contain exactly the same
% fields as a posn (which is defined in io.m), with the same semantics.
% The difference is that stepping past a single character requires
% no memory allocation whatsoever *unless* that character is a newline.
%
% XXX We should consider making both fields of line_context into uint32s,
% to allow them to fit into a single 64 bit word. Simplicity would then
% require line_posn's argument being a uint32 as well.
:- type line_context
---> line_context(
line_context_current_line_number :: int,
line_context_offset_of_start_of_line :: int
).
:- type line_posn
---> line_posn(
line_posn_current_offset_in_file :: int
).
% Read a list of tokens either from the current input stream
% or from the specified input stream.
% Keep reading until we encounter either an `end' token
% (i.e. a full stop followed by whitespace) or the end-of-file.
%
% See char.is_whitespace for the definition of whitespace characters
% used by this predicate.
%
:- pred get_token_list(token_list::out, io::di, io::uo) is det.
:- pred get_token_list(io.text_input_stream::in, token_list::out,
io::di, io::uo) is det.
% The type `offset' represents a (zero-based) offset into a string.
%
:- type offset == int.
% string_get_token_list_max(String, MaxOffset, Tokens,
% InitialPos, FinalPos):
% linestr_get_token_list_max(String, MaxOffset, Tokens,
% InitialLineContext, FinalLineContext, InitialPos, FinalPos):
%
% Scan a list of tokens from a string, starting at the current offset
% specified by InitialPos. Keep scanning until either we encounter either
% an `end' token (i.e. a full stop followed by whitespace) or until we
% reach MaxOffset. (MaxOffset must be =< the length of the string.)
% Return the tokens scanned in Tokens, and return the position one
% character past the end of the last token in FinalPos.
%
% See char.is_whitespace for the definition of whitespace characters
% used by this predicate.
%
:- pred string_get_token_list_max(string::in, offset::in, token_list::out,
posn::in, posn::out) is det.
:- pred linestr_get_token_list_max(string::in, offset::in, token_list::out,
line_context::in, line_context::out, line_posn::in, line_posn::out) is det.
% string_get_token_list(String, Tokens, InitialPos, FinalPos):
%
% calls string_get_token_list_max above with MaxPos = length of String.
%
:- pred string_get_token_list(string::in, token_list::out,
posn::in, posn::out) is det.
% Convert a token to a human-readable string describing the token.
%
:- pred token_to_string(token::in, string::out) is det.
%--------------------------------------------------%
Next: mercury_term_parser, Previous: maybe, Up: Top [Contents]