The Mercury Library Reference Manual: char

%--------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%--------------------------------------------------%
% Copyright (C) 1994-2008, 2011 The University of Melbourne.
% Copyright (C) 2013-2015, 2017-2021 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%--------------------------------------------------%
%
% File: char.m.
% Main author: fjh.
% Stability: high.
%
% This module defines some predicates that manipulate characters.
%
% Originally we used `character' rather than `char' for the type name
% because `char' was used by NU-Prolog to mean something different.
% But now we use `char' and the use of `character' is discouraged.
%
% All predicates and functions exported by this module that deal with
% Unicode conform to version 13 of the Unicode standard.
%
%--------------------------------------------------%
%--------------------------------------------------%

:- module char.
:- interface.

:- import_module enum.
:- import_module list.
:- import_module pretty_printer.

%--------------------------------------------------%

    % A Unicode code point.
    %
:- type char == character.

:- instance enum(character).

    % `to_int'/1 and `to_int(in, out)' convert a character to its
    % corresponding numerical code (integer value).
    %
    % `to_int(out, in)' converts an integer value to a character value.
    % It fails for integer values outside of the Unicode range.
    %
    % Be aware that there is no guarantee that characters can be written to
    % files or to the standard output or standard error streams. Files using an
    % 8-bit national character set would only be able to represent a subset of
    % all possible code points. Currently, the Mercury standard library can
    % only read and write UTF-8 text files, so the entire range is supported
    % (excluding surrogate and noncharacter code points).
    %
    % Note that '\0' is not accepted as a Mercury null character literal.
    % Instead, a null character can be created using `det_from_int(0)'.
    % Null characters are not allowed in Mercury strings in C grades.
    %
:- func to_int(char) = int.
:- pred to_int(char, int).
:- mode to_int(in, out) is det.
:- mode to_int(in, in) is semidet.    % implied
:- mode to_int(out, in) is semidet.

    % Converts an integer to its corresponding character, if any.
    % A more expressive name for the reverse mode of to_int.
    %
:- pred from_int(int::in, char::out) is semidet.

    % Converts an integer to its corresponding character.
    % Throws an exception if there isn't one.
    %
:- func det_from_int(int) = char.
:- pred det_from_int(int::in, char::out) is det.

    % Returns the minimum numerical character code.
    %
:- func min_char_value = int.
:- pred min_char_value(int::out) is det.

    % Returns the maximum numerical character code.
    %
:- func max_char_value = int.
:- pred max_char_value(int::out) is det.

%--------------------------------------------------%

    % True iff the character is a lowercase letter (a-z) in the ASCII range.
    %
:- pred is_lower(char::in) is semidet.

    % True iff the character is an uppercase letter (A-Z) in the ASCII range.
    %
:- pred is_upper(char::in) is semidet.

    % Convert a character to lowercase.
    % Note that this only converts letters (A-Z) in the ASCII range.
    %
:- func to_lower(char) = char.
:- pred to_lower(char::in, char::out) is det.

    % Convert a character to uppercase.
    % Note that this only converts letters (a-z) in the ASCII range.
    %
:- func to_upper(char) = char.
:- pred to_upper(char::in, char::out) is det.

    % lower_upper(Lower, Upper) is true iff
    % Lower is a lowercase letter (a-z) and Upper is the corresponding
    % uppercase letter (A-Z) in the ASCII range.
    %
:- pred lower_upper(char, char).
:- mode lower_upper(in, out) is semidet.
:- mode lower_upper(out, in) is semidet.

%--------------------------------------------------%

    % True iff the character is in the ASCII range (0-127).
    %
:- pred is_ascii(char::in) is semidet.

    % True iff the character is a whitespace character in the ASCII range:
    %
    %   U+0020  space
    %   U+0009  character tabulation (horizontal tab)
    %   U+000A  line feed
    %   U+000B  line tabulation (vertical tab)
    %   U+000C  form feed
    %   U+000D  carriage return
    %
:- pred is_whitespace(char::in) is semidet.

    % True iff the character is a letter (A-Z, a-z) in the ASCII range.
    %
:- pred is_alpha(char::in) is semidet.

    % True iff the character is a letter (A-Z, a-z) or digit (0-9)
    % in the ASCII range.
    %
:- pred is_alnum(char::in) is semidet.

    % True iff the character is a letter (A-Z, a-z) or an underscore (_)
    % in the ASCII range.
    %
:- pred is_alpha_or_underscore(char::in) is semidet.

    % True iff the character is a letter (A-Z, a-z), a digit (0-9) or an
    % underscore (_) in the ASCII range.
    %
:- pred is_alnum_or_underscore(char::in) is semidet.

%--------------------------------------------------%

    % True iff the character is a decimal digit (0-9) in the ASCII range.
    %
:- pred is_digit(char::in) is semidet.

    % True iff the character is a binary digit (0 or 1) in the ASCII range.
    %
:- pred is_binary_digit(char::in) is semidet.

    % True iff the character is an octal digit (0-7) in the ASCII range.
    %
:- pred is_octal_digit(char::in) is semidet.

    % True iff the character is a decimal digit (0-9) in the ASCII range.
    % Synonym for is_digit/1.
    %
:- pred is_decimal_digit(char::in) is semidet.

    % True iff the character is a hexadecimal digit (0-9, a-f, A-F) in the
    % ASCII range.
    %
:- pred is_hex_digit(char::in) is semidet.

    % is_base_digit(Base, Digit):
    % True iff Digit is a digit in the given Base (0-9, a-z, A-Z).
    % Throws an exception if Base < 2 or Base > 36.
    %
:- pred is_base_digit(int::in, char::in) is semidet.

%--------------------------------------------------%

    % binary_digit_to_int(Char, Int):
    % True iff Char is a binary digit (0-1) representing the value Int.
    %
:- pred binary_digit_to_int(char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_binary_digit_to_int(char) = int.

    % octal_digit_to_int(Char, Int):
    % True iff Char is an octal digit (0-7) representing the value Int.
    %
:- pred octal_digit_to_int(char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_octal_digit_to_int(char) = int.

    % decimal_digit_to_int(Char, Int):
    % True iff Char is a decimal digit (0-9) representing the value Int.
    %
:- pred decimal_digit_to_int(char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_decimal_digit_to_int(char) = int.

    % hex_digit_to_int(Char, Int):
    % True iff Char is a hexadecimal digit (0-9, a-z or A-F) representing the
    % value Int.
    %
:- pred hex_digit_to_int(char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_hex_digit_to_int(char) = int.

    % base_digit_to_int(Base, Char, Int):
    % True iff Char is a decimal digit (0-9) or a letter (a-z, A-Z)
    % representing the value Int (0-35) in the given base.
    % Throws an exception if Base < 2 or Base > 36.
    %
:- pred base_digit_to_int(int::in, char::in, int::out) is semidet.

    % As above, but throws an exception instead of failing.
    %
:- func det_base_digit_to_int(int, char) = int.

    % A version of base_digit_to_int that does not check whether
    % Base is in the range 2 to 36. If it is not, the behavior is undefined.
    %
:- pred unsafe_base_digit_to_int(int::in, char::in, int::out) is semidet.

%--------------------------------------------------%

    % Convert an integer in the range 0-1 to a binary digit (0 or 1) in the
    % ASCII range.
    %
:- pred int_to_binary_digit(int::in, char::out) is semidet.

    % As above, but throw an exception instead of failing.
    %
:- func det_int_to_binary_digit(int) = char.

    % Convert an integer 0-7 to an octal digit (0-7) in the ASCII range.
    %
:- pred int_to_octal_digit(int::in, char::out) is semidet.

    % As above, but throw an exception instead of failing.
    %
:- func det_int_to_octal_digit(int) = char.

    % Convert an integer 0-9 to a decimal digit (0-9) in the ASCII range.
    %
:- pred int_to_decimal_digit(int::in, char::out) is semidet.

    % As above, but throw an exception in instead of failing.
    %
:- func det_int_to_decimal_digit(int) = char.

    % Convert an integer 0-15 to an uppercase hexadecimal digit (0-9, A-F) in
    % the ASCII range.
    %
:- pred int_to_hex_digit(int::in, char::out) is semidet.

    % As above, but throw an exception in instead of failing.
    %
:- func det_int_to_hex_digit(int) = char.

    % base_int_to_digit(Base, Int, Char):
    % True iff Char is a decimal digit (0-9) or an uppercase letter (A-Z)
    % representing the value Int (0-35) in the given base.
    % Throws an exception if Base < 2 or Base > 36.
    %
:- pred base_int_to_digit(int::in, int::in, char::out) is semidet.

    % As above, but throw an exception instead of failing.
    %
:- func det_base_int_to_digit(int, int) = char.

%--------------------------------------------------%

    % Encode a Unicode code point in UTF-8.
    % Fails for surrogate code points.
    %
:- pred to_utf8(char::in, list(int)::out) is semidet.

    % As above, but represent UTF-8 code units using uint8s.
    %
:- pred to_utf8_uint8(char::in, list(uint8)::out) is semidet.

    % Encode a Unicode code point in UTF-16 (native endianness).
    % Fails for surrogate code points.
    %
:- pred to_utf16(char::in, list(int)::out) is semidet.

    % As above, but represent UTF-16 code units using uint16s.
    %
:- pred to_utf16_uint16(char::in, list(uint16)::out) is semidet.

    % True iff the character is a Unicode Surrogate code point, that is a code
    % point in General Category `Other,surrogate' (`Cs').
    % In UTF-16, a code point with a scalar value greater than 0xffff is
    % encoded with a pair of surrogate code points.
    %
:- pred is_surrogate(char::in) is semidet.

    % True iff the character is a Unicode leading surrogate code point.
    % A leading surrogate code point is in the inclusive range from
    % 0xd800 to 0xdbff.
    %
:- pred is_leading_surrogate(char::in) is semidet.

    % True iff the character is a Unicode trailing surrogate code point.
    % A trailing surrogate code point is in the inclusive range from
    % 0xdc00 to 0xdfff.
    %
:- pred is_trailing_surrogate(char::in) is semidet.

    % True iff the character is a Unicode Noncharacter code point.
    % Sixty-six code points are not used to encode characters.
    % These code points should not be used for interchange, but may be used
    % internally.
    %
:- pred is_noncharacter(char::in) is semidet.

    % True iff the character is a Unicode Control code point, that is a code
    % point in General Category `Other,control' (`Cc').
    %
:- pred is_control(char::in) is semidet.

    % True iff the character is a Unicode Space Separator code point, that is a
    % code point in General Category `Separator,space' (`Zs').
    %
:- pred is_space_separator(char::in) is semidet.

    % True iff the character is a Unicode Line Separator code point, that is a
    % code point in General Category `Separator,line' (`Zl').
    %
:- pred is_line_separator(char::in) is semidet.

    % True iff the character is a Unicode Paragraph Separator code point, that
    % is a code point in General Category `Separator,paragraph' (`Zp').
    %
:- pred is_paragraph_separator(char::in) is semidet.

    % True iff the character is a Unicode Private-use code point, that is a
    % code point in General Category `Other,private use' (`Co').
    %
:- pred is_private_use(char::in) is semidet.

%--------------------------------------------------%

    % Convert a char to a pretty_printer.doc for formatting.
    %
:- func char_to_doc(char) = pretty_printer.doc.

%--------------------------------------------------%

% The following have all been deprecated.

    % Use hex_digit_to_int/2 instead.
    %
:- pred is_hex_digit(char, int).
:- mode is_hex_digit(in, out) is semidet.

    % Convert an integer 0-15 to a hexadecimal digit (0-9, A-F) in the ASCII
    % range.
    %
    % Use int_to_hex_digit/2 instead.
    %
:- pred int_to_hex_char(int, char).
:- mode int_to_hex_char(in, out) is semidet.

%--------------------------------------------------%
%
% Computing hashes of chars.
%

    % Compute a hash value for a char.
    %
:- func hash(char) = int.
:- pred hash(char::in, int::out) is det.

%--------------------------------------------------%
%--------------------------------------------------%
15 char