2007-04-19 06:05:55 -04:00
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Name: wx/stringops.h
|
|
|
|
// Purpose: implementation of wxString primitive operations
|
|
|
|
// Author: Vaclav Slavik
|
|
|
|
// Modified by:
|
|
|
|
// Created: 2007-04-16
|
|
|
|
// RCS-ID: $Id$
|
|
|
|
// Copyright: (c) 2007 REA Elektronik GmbH
|
|
|
|
// Licence: wxWindows licence
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
#ifndef _WX_WXSTRINGOPS_H__
|
|
|
|
#define _WX_WXSTRINGOPS_H__
|
|
|
|
|
|
|
|
#include "wx/chartype.h"
|
|
|
|
#include "wx/stringimpl.h"
|
|
|
|
#include "wx/unichar.h"
|
2007-04-28 03:56:45 -04:00
|
|
|
#include "wx/buffer.h"
|
2007-04-19 06:05:55 -04:00
|
|
|
|
|
|
|
// This header contains wxStringOperations "namespace" class that implements
|
|
|
|
// elementary operations on string data as static methods; wxString methods and
|
|
|
|
// iterators are implemented in terms of it. Two implementations are available,
|
|
|
|
// one for UTF-8 encoded char* string and one for "raw" wchar_t* strings (or
|
|
|
|
// char* in ANSI build).
|
|
|
|
|
|
|
|
// FIXME-UTF8: only wchar after we remove ANSI build
|
|
|
|
#if wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
|
|
|
|
struct WXDLLIMPEXP_BASE wxStringOperationsWchar
|
|
|
|
{
|
|
|
|
// moves the iterator to the next Unicode character
|
|
|
|
static void IncIter(wxStringImpl::iterator& i) { ++i; }
|
|
|
|
static void IncIter(wxStringImpl::const_iterator& i) { ++i; }
|
|
|
|
|
|
|
|
// moves the iterator to the previous Unicode character
|
|
|
|
static void DecIter(wxStringImpl::iterator& i) { --i; }
|
|
|
|
static void DecIter(wxStringImpl::const_iterator& i) { --i; }
|
|
|
|
|
|
|
|
// moves the iterator by n Unicode characters
|
2008-01-06 17:27:01 -05:00
|
|
|
static wxStringImpl::iterator AddToIter(const wxStringImpl::iterator& i, ptrdiff_t n)
|
2007-04-19 06:05:55 -04:00
|
|
|
{ return i + n; }
|
2008-01-06 17:27:01 -05:00
|
|
|
static wxStringImpl::const_iterator AddToIter(const wxStringImpl::const_iterator& i, ptrdiff_t n)
|
2007-04-19 06:05:55 -04:00
|
|
|
{ return i + n; }
|
|
|
|
|
|
|
|
// returns distance of the two iterators in Unicode characters
|
2008-01-06 17:27:01 -05:00
|
|
|
static ptrdiff_t DiffIters(const wxStringImpl::iterator& i1,
|
|
|
|
const wxStringImpl::iterator& i2)
|
2007-04-19 06:05:55 -04:00
|
|
|
{ return i1 - i2; }
|
2008-01-06 17:27:01 -05:00
|
|
|
static ptrdiff_t DiffIters(const wxStringImpl::const_iterator& i1,
|
|
|
|
const wxStringImpl::const_iterator& i2)
|
2007-04-19 06:05:55 -04:00
|
|
|
{ return i1 - i2; }
|
|
|
|
|
|
|
|
// encodes the character to a form used to represent it in internal
|
|
|
|
// representation (returns a string in UTF8 version)
|
|
|
|
static wxChar EncodeChar(const wxUniChar& ch) { return (wxChar)ch; }
|
|
|
|
|
|
|
|
static wxUniChar DecodeChar(const wxStringImpl::const_iterator& i)
|
|
|
|
{ return *i; }
|
|
|
|
};
|
|
|
|
#endif // wxUSE_UNICODE_WCHAR || !wxUSE_UNICODE
|
|
|
|
|
|
|
|
|
|
|
|
#if wxUSE_UNICODE_UTF8
|
|
|
|
struct WXDLLIMPEXP_BASE wxStringOperationsUtf8
|
|
|
|
{
|
|
|
|
// checks correctness of UTF-8 sequence
|
2007-05-03 07:05:04 -04:00
|
|
|
static bool IsValidUtf8String(const char *c,
|
|
|
|
size_t len = wxStringImpl::npos);
|
2009-03-21 19:36:37 -04:00
|
|
|
static bool IsValidUtf8LeadByte(unsigned char c)
|
|
|
|
{
|
|
|
|
return (c <= 0x7F) || (c >= 0xC2 && c <= 0xF4);
|
|
|
|
}
|
2007-04-19 06:05:55 -04:00
|
|
|
|
|
|
|
// table of offsets to skip forward when iterating over UTF-8 sequence
|
2007-08-22 05:30:40 -04:00
|
|
|
static const unsigned char ms_utf8IterTable[256];
|
2007-04-19 06:05:55 -04:00
|
|
|
|
|
|
|
|
|
|
|
template<typename Iterator>
|
|
|
|
static void IncIter(Iterator& i)
|
|
|
|
{
|
|
|
|
wxASSERT( IsValidUtf8LeadByte(*i) );
|
|
|
|
i += ms_utf8IterTable[(unsigned char)*i];
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename Iterator>
|
|
|
|
static void DecIter(Iterator& i)
|
|
|
|
{
|
|
|
|
wxASSERT( IsValidUtf8LeadByte(*i) );
|
|
|
|
|
|
|
|
// Non-lead bytes are all in the 0x80..0xBF range (i.e. 10xxxxxx in
|
|
|
|
// binary), so we just have to go back until we hit a byte that is
|
|
|
|
// either < 0x80 (i.e. 0xxxxxxx in binary) or 0xC0..0xFF (11xxxxxx in
|
|
|
|
// binary; this includes some invalid values, but we can ignore it
|
|
|
|
// here, because we assume valid UTF-8 input for the purpose of
|
|
|
|
// efficient implementation).
|
|
|
|
--i;
|
|
|
|
while ( ((*i) & 0xC0) == 0x80 /* 2 highest bits are '10' */ )
|
|
|
|
--i;
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename Iterator>
|
2008-01-06 17:27:01 -05:00
|
|
|
static Iterator AddToIter(const Iterator& i, ptrdiff_t n)
|
2007-04-19 06:05:55 -04:00
|
|
|
{
|
|
|
|
Iterator out(i);
|
|
|
|
|
|
|
|
if ( n > 0 )
|
|
|
|
{
|
2008-01-06 17:27:01 -05:00
|
|
|
for ( ptrdiff_t j = 0; j < n; ++j )
|
2007-04-19 06:05:55 -04:00
|
|
|
IncIter(out);
|
|
|
|
}
|
|
|
|
else if ( n < 0 )
|
|
|
|
{
|
2008-01-06 17:27:01 -05:00
|
|
|
for ( ptrdiff_t j = 0; j > n; --j )
|
2007-04-19 06:05:55 -04:00
|
|
|
DecIter(out);
|
|
|
|
}
|
|
|
|
|
|
|
|
return out;
|
|
|
|
}
|
|
|
|
|
|
|
|
template<typename Iterator>
|
2008-01-06 17:27:01 -05:00
|
|
|
static ptrdiff_t DiffIters(Iterator i1, Iterator i2)
|
2007-04-19 06:05:55 -04:00
|
|
|
{
|
2008-01-06 17:27:01 -05:00
|
|
|
ptrdiff_t dist = 0;
|
2007-04-19 06:05:55 -04:00
|
|
|
|
|
|
|
if ( i1 < i2 )
|
|
|
|
{
|
|
|
|
while ( i1 != i2 )
|
|
|
|
{
|
|
|
|
IncIter(i1);
|
|
|
|
dist--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if ( i2 < i1 )
|
|
|
|
{
|
|
|
|
while ( i2 != i1 )
|
|
|
|
{
|
|
|
|
IncIter(i2);
|
|
|
|
dist++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return dist;
|
|
|
|
}
|
|
|
|
|
|
|
|
// encodes the character as UTF-8:
|
2007-06-08 16:17:34 -04:00
|
|
|
typedef wxUniChar::Utf8CharBuffer Utf8CharBuffer;
|
|
|
|
static Utf8CharBuffer EncodeChar(const wxUniChar& ch)
|
|
|
|
{ return ch.AsUTF8(); }
|
2007-04-19 06:05:55 -04:00
|
|
|
|
|
|
|
// returns n copies of ch encoded in UTF-8 string
|
|
|
|
static wxCharBuffer EncodeNChars(size_t n, const wxUniChar& ch);
|
|
|
|
|
|
|
|
// returns the length of UTF-8 encoding of the character with lead byte 'c'
|
|
|
|
static size_t GetUtf8CharLength(char c)
|
|
|
|
{
|
|
|
|
wxASSERT( IsValidUtf8LeadByte(c) );
|
|
|
|
return ms_utf8IterTable[(unsigned char)c];
|
|
|
|
}
|
|
|
|
|
|
|
|
// decodes single UTF-8 character from UTF-8 string
|
2007-08-22 05:24:56 -04:00
|
|
|
static wxUniChar DecodeChar(wxStringImpl::const_iterator i)
|
|
|
|
{
|
|
|
|
if ( (unsigned char)*i < 0x80 )
|
|
|
|
return (int)*i;
|
|
|
|
return DecodeNonAsciiChar(i);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
static wxUniChar DecodeNonAsciiChar(wxStringImpl::const_iterator i);
|
2007-04-19 06:05:55 -04:00
|
|
|
};
|
|
|
|
#endif // wxUSE_UNICODE_UTF8
|
|
|
|
|
|
|
|
|
|
|
|
#if wxUSE_UNICODE_UTF8
|
|
|
|
typedef wxStringOperationsUtf8 wxStringOperations;
|
|
|
|
#else
|
|
|
|
typedef wxStringOperationsWchar wxStringOperations;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif // _WX_WXSTRINGOPS_H__
|