Merge branch 'natural-sort'

Add natural sort functions.

See https://github.com/wxWidgets/wxWidgets/pull/1923
This commit is contained in:
Vadim Zeitlin 2020-07-07 23:17:05 +02:00
commit 2289f8be55
4 changed files with 394 additions and 5 deletions

View File

@ -42,12 +42,30 @@ wxDictionaryStringSortAscending(const wxString& s1, const wxString& s2)
return cmp ? cmp : s1.Cmp(s2);
}
// Comparison callback for reverse dictionary order: forwards to
// wxDictionaryStringSortAscending() with the two operands exchanged.
inline int wxCMPFUNC_CONV
wxDictionaryStringSortDescending(const wxString& s1, const wxString& s2)
{
    const int reversedOrder = wxDictionaryStringSortAscending(s2, s1);

    return reversedOrder;
}
// Compares two strings in natural sort order. The implementation uses the
// native StrCmpLogicalW() under Windows and wxCmpNaturalGeneric() elsewhere.
WXDLLIMPEXP_BASE
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2);
// wxWidgets' own natural sort comparison. When wxUSE_REGEX is disabled it
// only performs a collation of the lower-cased strings.
WXDLLIMPEXP_BASE
int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2);
// Comparison callback for ascending natural order: a plain forwarder to
// wxCmpNatural() taking the operands in the order given.
inline int wxCMPFUNC_CONV
wxNaturalStringSortAscending(const wxString& s1, const wxString& s2)
{
    const int naturalOrder = wxCmpNatural(s1, s2);

    return naturalOrder;
}
// Comparison callback for descending natural order: forwards to
// wxCmpNatural() with the two operands exchanged.
inline int wxCMPFUNC_CONV
wxNaturalStringSortDescending(const wxString& s1, const wxString& s2)
{
    const int reversedOrder = wxCmpNatural(s2, s1);

    return reversedOrder;
}
#if wxUSE_STD_CONTAINERS
typedef int (wxCMPFUNC_CONV *CMPFUNCwxString)(wxString*, wxString*);

View File

@ -363,7 +363,8 @@ public:
This function can be used with wxSortedArrayString::Sort() or passed as an
argument to wxSortedArrayString constructor.
@see wxStringSortDescending(), wxDictionaryStringSortAscending()
@see wxStringSortDescending(), wxDictionaryStringSortAscending(),
wxNaturalStringSortAscending()
@since 3.1.0
*/
@ -375,7 +376,8 @@ int wxStringSortAscending(const wxString& s1, const wxString& s2);
This function can be used with wxSortedArrayString::Sort() or passed as an
argument to wxSortedArrayString constructor.
@see wxStringSortAscending(), wxDictionaryStringSortAscending()
@see wxStringSortAscending(), wxDictionaryStringSortDescending(),
wxNaturalStringSortDescending()
@since 3.1.0
*/
@ -392,7 +394,9 @@ int wxStringSortDescending(const wxString& s1, const wxString& s2);
This function can be used with wxSortedArrayString::Sort() or passed as an
argument to wxSortedArrayString constructor.
@see wxStringSortAscending(), wxDictionaryStringSortDescending()
@see wxDictionaryStringSortDescending(),
wxStringSortAscending(),
wxNaturalStringSortAscending()
@since 3.1.0
*/
@ -403,11 +407,94 @@ int wxDictionaryStringSortAscending(const wxString& s1, const wxString& s2);
See wxDictionaryStringSortAscending() for the dictionary sort description.
@see wxStringSortDescending()
@see wxDictionaryStringSortAscending(),
wxStringSortDescending(),
wxNaturalStringSortDescending()
@since 3.1.0
*/
int wxDictionaryStringSortAscending(const wxString& s1, const wxString& s2);
int wxDictionaryStringSortDescending(const wxString& s1, const wxString& s2);
/**
Comparison function comparing strings in natural order.
This function can be used with wxSortedArrayString::Sort()
or passed as an argument to wxSortedArrayString constructor.
See wxCmpNatural() for more information about how natural
sort order is implemented.
@return The result of calling wxCmpNatural() with @a s1 and @a s2,
in this order.
@see wxNaturalStringSortDescending(),
wxStringSortAscending(), wxDictionaryStringSortAscending()
@since 3.1.4
*/
int wxNaturalStringSortAscending(const wxString& s1, const wxString& s2);
/**
Comparison function comparing strings in reverse natural order.
This function can be used with wxSortedArrayString::Sort()
or passed as an argument to wxSortedArrayString constructor.
See wxCmpNatural() for more information about how natural
sort order is implemented.
@return The result of calling wxCmpNatural() with the arguments
exchanged, i.e. with @a s2 first and @a s1 second.
@see wxNaturalStringSortAscending(),
wxStringSortDescending(), wxDictionaryStringSortDescending()
@since 3.1.4
*/
int wxNaturalStringSortDescending(const wxString& s1, const wxString& s2);
/**
This function compares strings using case-insensitive collation and
additionally, numbers within strings are recognised and compared
numerically, rather than alphabetically. When used for sorting,
the result is that e.g. file names containing numbers are sorted
in a natural way.
For example, sorting with a simple string comparison results in:
- file1.txt
- file10.txt
- file100.txt
- file2.txt
- file20.txt
- file3.txt
But sorting the same strings in natural sort order results in:
- file1.txt
- file2.txt
- file3.txt
- file10.txt
- file20.txt
- file100.txt
wxCmpNatural() uses an OS native natural sort function when available
(currently only under Microsoft Windows), wxCmpNaturalGeneric() otherwise.
Be aware that OS native implementations might differ from each other,
and might change behaviour from release to release.
@return A negative value if @a s1 sorts before @a s2, zero if the two
strings are considered equal and a positive value if @a s1 sorts
after @a s2.
@see wxNaturalStringSortAscending(), wxNaturalStringSortDescending()
@since 3.1.4
*/
int wxCmpNatural(const wxString& s1, const wxString& s2);
/**
This is wxWidgets' own implementation of the natural sort comparison function.
This function requires wxRegEx: if it is unavailable, numbers within
strings are not recognised and only case-insensitive collation is performed.
@return A negative value if @a s1 sorts before @a s2, zero if the two
strings are considered equal and a positive value if @a s1 sorts
after @a s2.
@see wxCmpNatural()
@since 3.1.4
*/
int wxCmpNaturalGeneric(const wxString& s1, const wxString& s2);
// ============================================================================
// Global functions/macros

View File

@ -20,13 +20,20 @@
#endif
#include "wx/arrstr.h"
#include "wx/regex.h"
#include "wx/scopedarray.h"
#include "wx/wxcrt.h"
#include "wx/beforestd.h"
#include <algorithm>
#include <functional>
#include "wx/afterstd.h"
#if defined( __WINDOWS__ )
#include <shlwapi.h>
#endif
// ============================================================================
// ArrayString
// ============================================================================
@ -721,3 +728,199 @@ wxArrayString wxSplit(const wxString& str, const wxChar sep, const wxChar escape
return ret;
}
#if wxUSE_REGEX
namespace // helpers needed by wxCmpNaturalGeneric()
{
// One homogeneous piece of a string taking part in a natural comparison.
//
// Pieces of different kinds are ordered by kind, following the order in
// which the enumerators are declared:
//
//      Empty < SpaceOrPunct < Digit < LetterOrSymbol
//
// Pieces of the same kind are ordered as follows:
//
//      SpaceOrPunct    by collating their text
//      Digit           by their numeric value
//      LetterOrSymbol  by collating their lower-cased text
struct wxStringFragment
{
    enum Type
    {
        Empty,
        SpaceOrPunct,   // run of whitespace and/or punctuation characters
        Digit,          // run of decimal digits
        LetterOrSymbol  // run of anything not covered by the kinds above
    };

    Type type;
    wxString text;
    wxUint64 value;     // only meaningful for fragments of Digit kind

    // A default-constructed fragment is an empty one.
    wxStringFragment() : type(Empty), value(0) {}
};
// Extracts the leading fragment of the given string.
//
// The longest prefix of text consisting of characters of a single kind
// (letters/symbols, digits, or whitespace/punctuation) is removed from text
// and returned as a fragment of the corresponding type. Runs of digits are
// cut after 19 characters so that their value always fits into wxUint64.
//
// An Empty fragment is returned if, and only if, text is empty: whenever
// text is not empty, at least one character is consumed, so that callers
// repeatedly calling this function are guaranteed to terminate.
wxStringFragment GetFragment(wxString& text)
{
    static const wxRegEx reSpaceOrPunct(wxS("^([[:space:]]|[[:punct:]])+"));
    // Limit the length to make sure the value will fit into a wxUint64
    static const wxRegEx reDigit(wxS("^[[:digit:]]{1,19}"));
    // Note that '|' is a literal character inside a bracket expression, not
    // an alternation, but as it is a punctuation character it is excluded
    // by [:punct:] anyhow.
    static const wxRegEx reLetterOrSymbol(wxS("^[^[:space:]|[:punct:]|[:digit:]]+"));

    wxStringFragment fragment;

    if ( text.empty() )
        return fragment;

    size_t length = 0;

    // In attempt to minimize the number of wxRegEx.Matches() calls,
    // try to do them from the most expected to the least expected
    // string fragment type.
    if ( reLetterOrSymbol.Matches(text) )
    {
        if ( reLetterOrSymbol.GetMatch(NULL, &length) )
            fragment.type = wxStringFragment::LetterOrSymbol;
    }
    else if ( reDigit.Matches(text) )
    {
        if ( reDigit.GetMatch(NULL, &length) )
            fragment.type = wxStringFragment::Digit;
    }
    else if ( reSpaceOrPunct.Matches(text) )
    {
        if ( reSpaceOrPunct.GetMatch(NULL, &length) )
            fragment.type = wxStringFragment::SpaceOrPunct;
    }

    if ( fragment.type == wxStringFragment::Empty || length == 0 )
    {
        // None of the patterns matched. This is not expected to happen as,
        // together, they should cover any character, but if it does (e.g.
        // because a pattern could not be used), returning an Empty fragment
        // without consuming anything would make the caller loop forever.
        // Treat all the remaining text as a single LetterOrSymbol fragment
        // instead, which degrades to the plain case-insensitive collation.
        fragment.type = wxStringFragment::LetterOrSymbol;
        length = text.length();
    }

    fragment.text = text.Left(length);

    if ( fragment.type == wxStringFragment::Digit &&
            !fragment.text.ToULongLong(&fragment.value) )
    {
        // The digits could not be converted to a number, so there is no
        // meaningful value to compare: fall back to comparing the text.
        fragment.type = wxStringFragment::LetterOrSymbol;
        fragment.value = 0;
    }

    text.erase(0, length);

    return fragment;
}
// Compares two fragments in natural order.
//
// Returns a negative value if lhs sorts before rhs, 0 if they are equivalent
// and a positive value if lhs sorts after rhs.
int CompareFragmentNatural(const wxStringFragment& lhs, const wxStringFragment& rhs)
{
    // Fragments of different kinds are ordered by their kind alone, i.e.
    //
    //      Empty < SpaceOrPunct < Digit < LetterOrSymbol
    //
    // which is exactly the order in which the enumerators are declared.
    if ( lhs.type != rhs.type )
        return lhs.type < rhs.type ? -1 : 1;

    // Both fragments are of the same kind here.
    switch ( lhs.type )
    {
        case wxStringFragment::Empty:
            return 0;

        case wxStringFragment::SpaceOrPunct:
            return wxStrcoll_String(lhs.text, rhs.text);

        case wxStringFragment::Digit:
            if ( lhs.value == rhs.value )
                return 0;

            return lhs.value < rhs.value ? -1 : 1;

        case wxStringFragment::LetterOrSymbol:
            return wxStrcoll_String(lhs.text.Lower(), rhs.text.Lower());
    }

    // All the enumerators are handled above, this is only here to avoid
    // compiler warnings about not returning a value.
    return 1;
}
} // unnamed namespace
// ----------------------------------------------------------------------------
// wxCmpNaturalGeneric
// ----------------------------------------------------------------------------
//
// Compares the strings fragment by fragment, stopping at the first pair of
// fragments which are not equivalent.
int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2)
{
    // Work on copies as GetFragment() consumes the string passed to it.
    wxString restLHS(s1);
    wxString restRHS(s2);

    while ( !(restLHS.empty() && restRHS.empty()) )
    {
        const wxStringFragment partLHS = GetFragment(restLHS);
        const wxStringFragment partRHS = GetFragment(restRHS);

        const int order = CompareFragmentNatural(partLHS, partRHS);
        if ( order != 0 )
            return order;
    }

    // Both strings were exhausted without finding any difference.
    return 0;
}
#else
// Fallback used when wxRegEx is not available: numbers are not recognised
// and the strings are simply collated after lower-casing them.
int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2)
{
    const wxString lowered1 = s1.Lower();
    const wxString lowered2 = s2.Lower();

    return wxStrcoll_String(lowered1, lowered2);
}
#endif // #if wxUSE_REGEX
// ----------------------------------------------------------------------------
// Declaration of StrCmpLogicalW()
// ----------------------------------------------------------------------------
//
// In some distributions of MinGW32, this function is exported in the library,
// but not declared in shlwapi.h. Therefore we declare it here, with the
// signature used by wxCmpNatural(): it takes two wide character strings and
// returns an int.
#if defined( __MINGW32_TOOLCHAIN__ )
extern "C" __declspec(dllimport) int WINAPI StrCmpLogicalW(LPCWSTR psz1, LPCWSTR psz2);
#endif // defined( __MINGW32_TOOLCHAIN__ )
// ----------------------------------------------------------------------------
// wxCmpNatural
// ----------------------------------------------------------------------------
//
// If a native version of Natural sort is available, then use that, otherwise
// use the generic version.
//
// Returns the result of StrCmpLogicalW() under Windows and the result of
// wxCmpNaturalGeneric() on the other platforms.
//
// NB: This function must not be defined as "inline" here: it is declared in
//     the header as a normal, exported (WXDLLIMPEXP_BASE) function and is
//     called by the inline wxNaturalStringSortXXX() helpers from other
//     translation units, so its definition must always be emitted in this
//     one, which is not guaranteed for inline functions.
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
{
#if defined( __WINDOWS__ )
    return StrCmpLogicalW(s1.wc_str(), s2.wc_str());
#else
    return wxCmpNaturalGeneric(s1, s2);
#endif // #if defined( __WINDOWS__ )
}

View File

@ -780,3 +780,84 @@ void ArraysTestCase::IndexFromEnd()
CPPUNIT_ASSERT_EQUAL( 1, a.Index(1, /*bFromEnd=*/true) );
CPPUNIT_ASSERT_EQUAL( 2, a.Index(42, /*bFromEnd=*/true) );
}
// Checks the ordering implemented by wxCmpNaturalGeneric(): the relative
// order of empty strings, whitespace/punctuation, numbers and letters, as
// well as the numeric comparison of the numbers embedded in the strings.
// The checks are skipped, with a warning, if wxRegEx is not available.
TEST_CASE("wxNaturalStringComparisonGeneric()", "[wxString][compare]")
{
#if !wxUSE_REGEX
WARN("Skipping wxCmpNaturalGeneric() tests: wxRegEx not available");
#else
// simple string comparison
CHECK(wxCmpNaturalGeneric("a", "a") == 0);
CHECK(wxCmpNaturalGeneric("a", "z") < 0);
CHECK(wxCmpNaturalGeneric("z", "a") > 0);
// case insensitivity
CHECK(wxCmpNaturalGeneric("a", "A") == 0);
CHECK(wxCmpNaturalGeneric("A", "a") == 0);
CHECK(wxCmpNaturalGeneric("AB", "a") > 0);
CHECK(wxCmpNaturalGeneric("a", "AB") < 0);
// empty strings sort before whitespace and punctuation
CHECK(wxCmpNaturalGeneric("", " ") < 0);
CHECK(wxCmpNaturalGeneric(" ", "") > 0);
CHECK(wxCmpNaturalGeneric("", ",") < 0);
CHECK(wxCmpNaturalGeneric(",", "") > 0);
// empty strings sort before numbers
CHECK(wxCmpNaturalGeneric("", "0") < 0);
CHECK(wxCmpNaturalGeneric("0", "") > 0);
// empty strings sort before letters and symbols
CHECK(wxCmpNaturalGeneric("", "abc") < 0);
CHECK(wxCmpNaturalGeneric("abc", "") > 0);
// whitespace and punctuation sort before numbers
CHECK(wxCmpNaturalGeneric(" ", "1") < 0);
CHECK(wxCmpNaturalGeneric("1", " ") > 0);
CHECK(wxCmpNaturalGeneric(",", "1") < 0);
CHECK(wxCmpNaturalGeneric("1", ",") > 0);
// strings containing numbers sort before letters and symbols
CHECK(wxCmpNaturalGeneric("00", "a") < 0);
CHECK(wxCmpNaturalGeneric("a", "00") > 0);
// strings containing numbers are compared by their value, so leading
// zeroes are not significant
CHECK(wxCmpNaturalGeneric("01", "1") == 0);
CHECK(wxCmpNaturalGeneric("1", "01") == 0);
CHECK(wxCmpNaturalGeneric("1", "05") < 0);
CHECK(wxCmpNaturalGeneric("05", "1") > 0);
CHECK(wxCmpNaturalGeneric("10", "5") > 0);
CHECK(wxCmpNaturalGeneric("5", "10") < 0);
// 19 digits is the longest run of digits compared as a single number
CHECK(wxCmpNaturalGeneric("1", "9999999999999999999") < 0);
CHECK(wxCmpNaturalGeneric("9999999999999999999", "1") > 0);
// comparing strings composed from whitespace,
// punctuation, numbers, letters, and symbols
CHECK(wxCmpNaturalGeneric("1st", " 1st") > 0);
CHECK(wxCmpNaturalGeneric(" 1st", "1st") < 0);
CHECK(wxCmpNaturalGeneric("1st", ",1st") > 0);
CHECK(wxCmpNaturalGeneric(",1st", "1st") < 0);
CHECK(wxCmpNaturalGeneric("1st", "01st") == 0);
CHECK(wxCmpNaturalGeneric("01st", "1st") == 0);
CHECK(wxCmpNaturalGeneric("10th", "5th") > 0);
CHECK(wxCmpNaturalGeneric("5th", "10th") < 0);
CHECK(wxCmpNaturalGeneric("a1st", "a01st") == 0);
CHECK(wxCmpNaturalGeneric("a01st", "a1st") == 0);
CHECK(wxCmpNaturalGeneric("a10th", "a5th") > 0);
CHECK(wxCmpNaturalGeneric("a5th", "a10th") < 0);
CHECK(wxCmpNaturalGeneric("a 10th", "a5th") < 0);
CHECK(wxCmpNaturalGeneric("a5th", "a 10th") > 0);
CHECK(wxCmpNaturalGeneric("a1st1", "a01st01") == 0);
CHECK(wxCmpNaturalGeneric("a01st01", "a1st1") == 0);
CHECK(wxCmpNaturalGeneric("a10th10", "a5th5") > 0);
CHECK(wxCmpNaturalGeneric("a5th5", "a10th10") < 0);
CHECK(wxCmpNaturalGeneric("a 10th 10", "a5th 5") < 0);
CHECK(wxCmpNaturalGeneric("a5th 5", "a 10th 10") > 0);
#endif // #if !wxUSE_REGEX
}