%--------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%--------------------------------------------------%
% Copyright (C) 2023-2025 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%--------------------------------------------------%
%
% File: edit_distance.m.
% Stability: high.
%
% This module computes the edit distance between two sequences of items.
% Its job is both similar to, and distinct from, the job of edit_seq.m.
% It is similar in that both modules work out the simplest, cheapest way
% to transform one sequence into another. It is distinct because the two
% modules aim to solve different problems.
%
% - edit_seq.m aims to solve the problem of displaying the difference
% between two given sequences in a way that makes their differences
% as easy to understand as possible.
%
% - edit_distance.m aims to solve the problem of finding, in a pool of
% candidate sequences, the candidate that is closest to a given query
% sequence.
%
% Doing a good job with the second problem requires a mechanism that
% allows callers to specify that a transposition of two elements (such as
% replacing "bc" with "cb", thus transforming e.g. "abcd" into "acbd")
% has a different cost than deleting one element in one place in the sequence
% and inserting it back at another place. This mechanism does not help
% with the first problem at all (since the simplest way to display
% a transposition *is* as a delete/insert pair), and in fact its presence
% in the system would unnecessarily complicate the algorithm.
%
% Technically, this module computes Damerau-Levenshtein distances,
% while edit_seq.m computes Levenshtein distances. (The difference between
% the two is that only the former considers transpositions to be single
% operations.).
%
%--------------------------------------------------%
%--------------------------------------------------%
:- module edit_distance.
:- interface.
:- import_module char.
:- import_module list.
%--------------------------------------------------%
%--------------------------------------------------%
:- type edit_params(T)
---> edit_params(
% The cost of delete, insert, replace and transpose operations
% respectively. Only the *relative* values of the costs matter;
% if these are fixed, their *absolute* values are irrelevant
% (unless they are so high that they cause arithmetic
% overflows).
%
% For replacement operations, the cost may depend on what
% is being replaced by what. The intended use case is
% specifying that replacements that change a letter into
% the same letter in a different case (as char.to_lower
% or char.to_upper would do) are cheaper than other kinds of
% replacements.
cost_of_delete :: uint,
cost_of_insert :: uint,
cost_of_replace :: (func(T, T) = uint),
cost_of_transpose :: uint
).
% find_edit_distance(Params, SeqA, SeqB, Distance):
%
% Compute Distance as the sum of the costs of the operations
% needed to transform SeqA into SeqB, where the cost of each kind of
% edit operation is specified by Params.
%
:- pred find_edit_distance(edit_params(T)::in, list(T)::in, list(T)::in,
uint::out) is det.
%--------------------------------------------------%
% find_closest_seqs(Params, SourceSeq, TargetSeqs,
% BestEditDistance, HeadBestCloseSeq, TailBestCloseSeqs):
%
% Given a source sequence SourceSeq, find the sequence in TargetSeqs
% that has the best (meaning smallest) edit distance from SourceSeq.
% Return that target sequence as HeadBestCloseSeq, and its edit distance
% in BestEditDistance. If there are any other sequences in TargetSeqs
% that also have the same edit distance from SourceSeq, return them
% in TailBestCloseSeqs. [HeadBestCloseSeq | TailBestCloseSeqs] will contain
% target sequences in the same order as TargetSeqs.
%
% Note that TargetSeqs must be a nonempty list, i.e. it cannot be [].
% However, it is ok for one of its elements to be an empty sequence.
%
:- pred find_closest_seqs(edit_params(T)::in, list(T)::in, list(list(T))::in,
uint::out, list(T)::out, list(list(T))::out) is det.
% find_best_close_enough_seqs(Params, SourceSeq, TargetSeqs,
% MaxEditDistance, BestEditDistance,
% HeadBestCloseSeq, TailBestCloseSeqs):
%
% This is a version of find_closest_seqs that takes MaxEditDistance
% as an input, and which ignores any target sequence whose edit distance
% from SourceSeq is known to exceed MaxEditDistance.
%
% If all elements of TargetSeqs have edit distances from SourceSeq
% that are greater than MaxEditDistance, then this predicate will fail.
% If this is not the case, i.e. if there are some target sequences
% whose distance is less than or equal to MaxEditDistance, this predicate
% will succeed, and will do the same job as find_closest_seqs, but it is
% intended to do so more quickly, the efficiency gain coming from stopping
% the computation of exact edit distances as soon as it becomes known
% that the exact distance would exceed MaxEditDistance.
%
:- pred find_best_close_enough_seqs(edit_params(T)::in, list(T)::in,
list(list(T))::in, uint::in, uint::out,
list(T)::out, list(list(T))::out) is semidet.
%--------------------------------------------------%
% find_closest_strings(Params, SourceStr, TargetStrs,
% BestEditDistance, HeadBestCloseStr, TailBestCloseStrs):
%
% This is an instance of find_closest_seqs that takes care of the
% necessary conversions between strings and sequences of characters.
%
:- pred find_closest_strings(edit_params(char)::in, string::in,
list(string)::in, uint::out, string::out, list(string)::out) is det.
% find_best_close_enough_strings(Params, SourceStr, TargetStrs,
% MaxEditDistance,
% BestEditDistance, HeadBestCloseStr, TailBestCloseStrs):
%
% This is an instance of find_best_close_enough_seqs that takes care of the
% necessary conversions between strings and sequences of characters.
%
:- pred find_best_close_enough_strings(edit_params(char)::in, string::in,
list(string)::in, uint::in, uint::out, string::out, list(string)::out)
is semidet.
%--------------------------------------------------%
%--------------------------------------------------%