Next: mercury_term_parser, Previous: maybe, Up: Top [Contents]
%--------------------------------------------------% % vim: ft=mercury ts=4 sw=4 et %--------------------------------------------------% % Copyright (C) 1993-2000, 2003-2008, 2011-2012 The University of Melbourne. % Copyright (C) 2014-2024 The Mercury team. % This file is distributed under the terms specified in COPYING.LIB. %--------------------------------------------------% % % File: lexer.m. % Main author: fjh. % Stability: high. % % This module does lexical analysis of Mercury code. Its predicates analyze % character sequences, and return the token sequences they contain, up to % and including the token that ens a term, the period character. % (If the input does not conform to Mercury's rules, then some of the % returned tokens may be error indications.) % % This module exports predicates that do this lexical analysis both % on characters read in from a stream, and on characters in a string % (which may or may not represent the contents of a file). % %--------------------------------------------------% %--------------------------------------------------% :- module mercury_term_lexer. :- interface. :- import_module char. :- import_module io. :- import_module integer. %--------------------------------------------------% :- type token ---> name(string) ; variable(string) ; integer(integer_base, integer, signedness, integer_size) ; float(float) ; string(string) % "...." ; implementation_defined(string) % $name ; open % '(' ; open_ct % '(' without any preceding whitespace ; close % ')' ; open_list % '[' ; close_list % ']' ; open_curly % '{' ; close_curly % '}' ; ht_sep % '|' ; comma % ',' ; end % '.' ; junk(char) % junk character in the input stream ; error(string) % some other invalid token ; io_error(io.error) % error reading from the input stream ; eof % end-of-file ; integer_dot(integer). % The lexer will never return integer_dot. This token is used % internally in the lexer, to keep the grammar LL(1) so that % only one character of pushback is needed. But the lexer will % convert integer_dot/1 tokens to integer/1 tokens before % returning them. :- type integer_base ---> base_2 ; base_8 ; base_10 ; base_16. :- type signedness ---> signed ; unsigned. :- type integer_size ---> size_word ; size_8_bit ; size_16_bit ; size_32_bit ; size_64_bit. % For every token, we record the line number of the line % on which the token occurred. % :- type token_context == int. % line number % This "fat list" representation is more efficient than a list of pairs. % :- type token_list ---> token_cons(token, token_context, token_list) ; token_nil. % A line_context and a line_posn together contain exactly the same % fields as a posn (which is defined in io.m), with the same semantics. % The difference is that stepping past a single character requires % no memory allocation whatsoever *unless* that character is a newline. % % XXX We should consider making both fields of line_context into uint32s, % to allow them to fit into a single 64 bit word. Simplicity would then % require line_posn's argument being a uint32 as well. :- type line_context ---> line_context( line_context_current_line_number :: int, line_context_offset_of_start_of_line :: int ). :- type line_posn ---> line_posn( line_posn_current_offset_in_file :: int ). % Read a list of tokens either from the current input stream % or from the specified input stream. % Keep reading until we encounter either an `end' token % (i.e. a full stop followed by whitespace) or the end-of-file. % % See char.is_whitespace for the definition of whitespace characters % used by this predicate. % :- pred get_token_list(token_list::out, io::di, io::uo) is det. :- pred get_token_list(io.text_input_stream::in, token_list::out, io::di, io::uo) is det. % The type `offset' represents a (zero-based) offset into a string. % :- type offset == int. % string_get_token_list_max(String, MaxOffset, Tokens, % InitialPos, FinalPos): % linestr_get_token_list_max(String, MaxOffset, Tokens, % InitialLineContext, FinalLineContext, InitialPos, FinalPos): % % Scan a list of tokens from a string, starting at the current offset % specified by InitialPos. Keep scanning until either we encounter either % an `end' token (i.e. a full stop followed by whitespace) or until we % reach MaxOffset. (MaxOffset must be =< the length of the string.) % Return the tokens scanned in Tokens, and return the position one % character past the end of the last token in FinalPos. % % See char.is_whitespace for the definition of whitespace characters % used by this predicate. % :- pred string_get_token_list_max(string::in, offset::in, token_list::out, posn::in, posn::out) is det. :- pred linestr_get_token_list_max(string::in, offset::in, token_list::out, line_context::in, line_context::out, line_posn::in, line_posn::out) is det. % string_get_token_list(String, Tokens, InitialPos, FinalPos): % % calls string_get_token_list_max above with MaxPos = length of String. % :- pred string_get_token_list(string::in, token_list::out, posn::in, posn::out) is det. % Convert a token to a human-readable string describing the token. % :- pred token_to_string(token::in, string::out) is det. %--------------------------------------------------% %--------------------------------------------------%
Next: mercury_term_parser, Previous: maybe, Up: Top [Contents]