您的位置:首页 > 编程语言

SegWord::UString的待完全测试代码

2006-03-02 14:22 330 查看
l UString.h
#ifndef __USTRING_H__
#define __USTRING_H__

#include <list>

/*
* 文件名: UString.h
* 创建日期: 2005-12-12
* 创建者: Percy Lee
* 修改列表:
*
* 说明:
* Unicode string class for c++(in namespace UStr) with basic operation such as:
* length() : get the length of string;
* u_str(): get the unicode char array by pointer;
* c_str(): nil(return NULL);
* resize(Size): set the new capacity(=Size) of string;
* append(str): append the str back;
* sub_ustr(): get the sub-string under the given section;
* find(str): find sub-string str's section list;
* find_overlap(str): find sub-string str which can be overlap-occured in base-string;
* find_first(str): find the first pos of sub-string str;
* be_first(str): judge if str is the very beginning sub-string or not.
* It also supplies two global functions, US_TO_S and S_TO_US,
* for converting between Unicode strings and multibyte strings.
*
* Copyright (c) the Semean Studio.All rights reserved.
* E-mail: semean@163.com
********************************************************************************/
namespace UStr
{
/*
* Union: Section
* Description: an object of this union records either a single position
* (_begin) or an interval (_sect: start position and length).
* NOTE(review): _begin (size_t) and _sect (two unsigned ints) share storage,
* so the two views overlap; how they alias depends on the width of size_t on
* the target platform - confirm before reading one view after writing the other.
******************************************************************/
union Section
{
typedef Section value_type;
typedef Section* pointer_type;
typedef Section& reference_type;

size_t _begin; // position-only view
struct sect
{
unsigned int _begin; // interval view: start position
unsigned int _length; // interval view: length
} _sect;
Section(void); // zeroes both views (defined in UString.cpp)
};

// Signed integer type returned by the find_first family (-1 means "not found").
typedef long long Int64;
// List of sections produced by the find / find_overlap family.
typedef std::list<Section> SectionList;
// Mutable iterator over a SectionList.
typedef SectionList::iterator SectListIter;

/*
* 类: UString
* 说明:
* UString具有两种不同类型的对象,一是普通字符串,内部存储空间大小
* 为字符串所需要的空间大小;一是缓存字符串,内部存储空间大小是1K
* 的整数倍(管理策略见实现).
******************************************************************/

// Storage-management policy of a UString.
enum USType
{
    eUSNormal = 0, // ordinary string
    eUSBuffer = 1  // string used as a buffer
};

// Wide-character string class that owns a heap buffer of wchar_t.
// The USType chosen at construction selects how the buffer capacity is
// computed (see d_capacity in the implementation file).
class UString
{
public:
typedef UString value_type;
typedef UString* pointer_type;
typedef UString& reference_type;

// Constructors: empty string of the given storage type, copy, and
// conversion from a multibyte or wide C string (NULL yields an empty string).
UString(USType Ustype = eUSNormal);
UString(const UString& Str);
UString(const char* pStr);
UString(const wchar_t* pStr);
~UString(void);
public: // public interface
size_t length(void) const; // number of stored wide characters
wchar_t* u_str(void) const; // pointer to the internal buffer (not a copy)
char* c_str(void) const; // not implemented: always returns NULL
void resize(size_t Size); // reallocates the buffer; existing content is discarded
void append(const UString& Str);
void append(const char* pStr);
void append(const wchar_t* pStr);
void append(const wchar_t* pStr, size_t Begin, size_t Length); // appends pStr[Begin, Begin+Length)
UString sub_ustr(size_t Begin, size_t Length) const; // empty string if the range is out of bounds
UString sub_ustr(const Section& Sect) const; // same, using Sect._sect as the range
/* The find* family of methods
* Notes:
* (1) find and find_overlap return the list of all positions at which the
* substring occurs (because the length is fixed, the union only stores
* the start position). Matches returned by find_overlap may overlap.
* (2) Returning the list of all substring sections is only a convenience;
* returning the list costs performance (passing an STL list out through
* a reference parameter instead caused storage-management errors when
* exported from a DLL). In some situations find_first can be used to
* walk through all substrings instead.
* (3) The algorithms implemented by this family still need rigorous,
* large-scale testing.
* percylee 2006/3
**************************************************************************/
SectionList find(const UString& Str) const;
SectionList find(wchar_t WCh) const;
SectionList find(const wchar_t* pStr,size_t Begin, size_t Length) const;
SectionList find_overlap(const UString& Str) const;
SectionList find_overlap(const wchar_t* pStr,size_t Begin, size_t Length) const;
// find_first returns the index of the first match at or after Start, or -1.
Int64 find_first(const UString& Str,size_t Start) const;
Int64 find_first(wchar_t WCh, size_t Start) const;
Int64 find_first(const wchar_t* pStr,size_t Begin, size_t Length, size_t Start) const;
/**************************************************************************/
// be_first tests whether the given string occurs exactly at position Start.
bool be_first(const UString& Str,size_t Start) const;
bool be_first(const wchar_t* pStr,size_t Begin,size_t Length,size_t Start) const;
public: // operator overloads
UString& operator =(const UString& Str);
UString& operator =(const char* pStr);
UString& operator =(const wchar_t* pStr);
// Returns a reference to a static dummy character when pos is out of range.
wchar_t& operator [](size_t pos);

// The ordering operators compare lengths first and fall back to wcscmp
// only when the lengths are equal.
friend bool operator == ( const UString& Str1, const UString& Str2 );
friend bool operator < ( const UString& Str1, const UString& Str2 );
friend bool operator <= ( const UString& Str1, const UString& Str2 );
friend bool operator > ( const UString& Str1, const UString& Str2 );
friend bool operator >= ( const UString& Str1, const UString& Str2 );
private:
wchar_t* _pUStr; // owned buffer, allocated with new[] as _capacity+1 elements
size_t _length; // number of characters currently stored
size_t _capacity; // capacity value computed by d_capacity
USType _usType; // storage-management policy
private:
// set_ustring overwrites the content, reallocating when capacity is too small.
void set_ustring(const UString& Str);
void set_ustring(const char* pStr);
void set_ustring(const wchar_t* pStr);
// Allocates a fresh buffer sized for DefaultSize characters and empties the string.
void default_construct(size_t DefaultSize = 0);
// Maps a requested size to a capacity according to _usType.
size_t d_capacity(size_t Size);
};

/*
* Global conversion functions between multibyte character streams and
* Unicode (wide-character) strings
* Notes:
* S_TO_US requires UStrLen >= StrLen;
* US_TO_S requires StrLen >= UStrLen*2
* Both functions return 0 when a pointer is NULL or the size requirement
* is not met.
**************************************************************************/
size_t S_TO_US(char* pStr,size_t StrLen,wchar_t* pUStr,size_t UStrLen);
size_t US_TO_S(wchar_t* pUStr,size_t UStrLen,char* pStr,size_t StrLen);
};

#endif //__USTRING_H__

l UString.cpp

#include "StdAfx.h"
#include "./ustring.h"
#include <locale.h>
#include <stdlib.h>
#include <new>

/*
* File-local global variables and helper functions
*************************************************************************/
const unsigned int DEFAULT_CAPACITY = 1024; // capacity granularity used by d_capacity for eUSBuffer strings

// Computes the KMP failure table ("next" vector) of the pattern
// pStr[0 .. Length).
//
// Entry i holds the length of the longest proper prefix of pStr[0..i] that is
// also a suffix of it.
//
// Returns a new[]-allocated array of Length entries that the caller must
// release with delete[], or NULL when pStr is NULL, Length is 0, or the
// allocation fails.
inline size_t* KMPNext(const wchar_t* pStr, size_t Length)
{
    if( !pStr || Length == 0 )
    {
        return NULL;
    }

    // Non-throwing new: callers test the result against NULL, so allocation
    // failure must be reported through the return value, not an exception.
    size_t* pN = new (std::nothrow) size_t[Length];
    if( !pN )
    {
        return NULL;
    }

    pN[0] = 0;
    for( size_t i = 1; i < Length; ++i )
    {
        // Start from the border of the previous prefix and shrink it until it
        // can be extended by pStr[i] (or becomes empty).
        size_t border = pN[i-1];
        while( border > 0 && pStr[i] != pStr[border] )
        {
            border = pN[border-1];
        }
        pN[i] = ( pStr[i] == pStr[border] ) ? border + 1 : 0;
    }

    return pN;
}

// Convenience overload: builds the KMP failure table for the whole content
// of Str. The result has the same ownership rules as the pointer overload.
inline size_t* KMPNext(const UStr::UString& Str)
{
    const wchar_t* pPattern = Str.u_str();
    const size_t patternLength = Str.length();
    return KMPNext(pPattern, patternLength);
}

/*
* UStr名空间内的类实现
*************************************************************************/
// Zero-initializes both views of the union: first the position view, then
// the interval view (which is therefore the last member written).
UStr::Section::Section(void)
    : _begin(0)
{
    _sect._begin = 0;
    _sect._length = 0;
}

// Creates an empty string with the requested storage-management type.
// _usType is set before default_construct() runs because the capacity
// computation reads it.
UStr::UString::UString(USType Ustype/* = eUSNormal*/)
    : _usType(Ustype)
{
    default_construct();
}

// Copy constructor. The copy is always an eUSNormal string, regardless of
// the storage type of Str; the buffer is sized for Str's length and then
// filled with its content.
UStr::UString::UString(const UString& Str)
    : _usType(eUSNormal)
{
    const size_t sourceLength = Str.length();
    default_construct(sourceLength);
    set_ustring(Str);
}

// Constructs an eUSNormal string from a NUL-terminated multibyte string.
// A NULL pointer produces an empty string.
UStr::UString::UString(const char* pStr)
    : _usType(eUSNormal)
{
    if( !pStr )
    {
        default_construct();
        return;
    }

    // Reserve one slot per input byte, then convert into the buffer.
    default_construct(strlen(pStr));
    set_ustring(pStr);
}

// Constructs an eUSNormal string from a NUL-terminated wide string.
// A NULL pointer produces an empty string.
UStr::UString::UString(const wchar_t* pStr)
    : _usType(eUSNormal)
{
    if( !pStr )
    {
        default_construct();
        return;
    }

    default_construct(wcslen(pStr));
    set_ustring(pStr);
}

// Releases the character buffer owned by this string.
UStr::UString::~UString(void)
{
    _length = 0;
    delete[] _pUStr;
}

// Returns the number of wide characters currently stored in the string.
size_t UStr::UString::length(void) const
{
return _length;
}

// Returns the internal character buffer itself (not a copy). The pointer is
// non-const even though the method is const, so callers can modify the
// content through it.
wchar_t* UStr::UString::u_str(void) const
{
return _pUStr;
}

// Not implemented: always returns NULL (no multibyte representation is kept).
char* UStr::UString::c_str(void) const
{
return NULL;
}

// Replaces the buffer with a freshly allocated one sized for Size characters.
// The previous content is discarded; the string is empty afterwards.
void UStr::UString::resize(size_t Size)
{
    _length = 0;
    delete[] _pUStr;
    default_construct(Size);
}

// Appends the whole content of Str; appending an empty string is a no-op.
void UStr::UString::append(const UString& Str)
{
    const size_t count = Str.length();
    if( count > 0 )
    {
        append(Str.u_str(),0,count);
    }
}

void UStr::UString::append(const char* pStr)
{
if( !pStr )
{
return;
}
UString ustr(pStr);
append(ustr);
}

// Appends a NUL-terminated wide string. A NULL pointer or an empty string
// is ignored.
void UStr::UString::append(const wchar_t* pStr)
{
    const size_t count = pStr ? wcslen(pStr) : 0;
    if( count > 0 )
    {
        append(pStr,0,count);
    }
}

// Appends the Length characters pStr[Begin .. Begin+Length) to this string.
//
// pStr   : source buffer; NULL is ignored.
// Begin  : index of the first character to copy from pStr.
// Length : number of characters to copy; 0 is ignored.
//
// The buffer grows by d_capacity(Length) when the current capacity cannot
// hold the result. The result is always terminated with L'\0'.
void UStr::UString::append(const wchar_t* pStr, size_t Begin, size_t Length)
{
    if( !pStr || Length == 0 )
    {
        return;
    }

    const wchar_t* pSrc = pStr + Begin;

    if( _capacity < _length + Length )
    {
        // Allocate before touching any member so that a failed allocation
        // leaves the string unchanged.
        const size_t newCapacity = _capacity + d_capacity(Length);
        wchar_t* pBuf = new wchar_t[newCapacity+1];

        wcsncpy(pBuf,_pUStr,_length);
        for( size_t i = 0; i < Length; ++i )
        {
            pBuf[_length+i] = pSrc[i];
        }

        // The old buffer is released only after copying, because pStr may
        // point into it (self-append).
        delete[] _pUStr;
        _pUStr = pBuf;
        _capacity = newCapacity;
    }
    else
    {
        for( size_t i = 0; i < Length; ++i )
        {
            _pUStr[_length+i] = pSrc[i];
        }
    }

    _length += Length;
    _pUStr[_length] = L'\0';
}

// Returns a copy of the Length characters starting at Begin.
// An empty string is returned when the requested range does not lie
// entirely inside this string.
UStr::UString UStr::UString::sub_ustr(size_t Begin, size_t Length) const
{
    UString ustr;

    // Written without computing Begin + Length, which could wrap around and
    // make an out-of-range request look valid.
    if( Begin > _length || Length > _length - Begin )
    {
        return ustr;
    }
    ustr.append(_pUStr,Begin,Length);

    return ustr;
}

// Returns a copy of the interval described by Sect._sect (start position and
// length). An empty string is returned when the interval does not lie
// entirely inside this string.
UStr::UString UStr::UString::sub_ustr(const UStr::Section& Sect) const
{
    UString ustr;

    // Widen to size_t before checking: adding the two unsigned int fields
    // directly could wrap around and bypass the bounds check.
    const size_t begin = Sect._sect._begin;
    const size_t length = Sect._sect._length;
    if( begin > _length || length > _length - begin )
    {
        return ustr;
    }
    ustr.append(_pUStr,begin,length);

    return ustr;
}

// Finds all non-overlapping occurrences of Str; forwards to the pointer
// overload with the whole content of Str as the pattern.
UStr::SectionList UStr::UString::find(const UString& Str) const
{
    const wchar_t* pPattern = Str.u_str();
    const size_t patternLength = Str.length();
    return find(pPattern,0,patternLength);
}

// Returns the positions (stored in Section::_begin) of every occurrence of
// the character WCh, in increasing order.
UStr::SectionList UStr::UString::find(wchar_t WCh) const
{
    SectionList hits;
    Section hit;

    size_t pos = 0;
    while( pos < _length )
    {
        if( _pUStr[pos] == WCh )
        {
            hit._begin = pos;
            hits.push_back(hit);
        }
        ++pos;
    }

    return hits;
}

// Finds all non-overlapping occurrences of the pattern
// pStr[Begin .. Begin+Length) using KMP matching.
//
// Returns the start positions (stored in Section::_begin) in increasing
// order. The list is empty when pStr is NULL, Length is 0, the pattern is
// longer than this string, or the KMP table cannot be built.
UStr::SectionList UStr::UString::find(const wchar_t* pStr,size_t Begin, size_t Length) const
{
    SectionList ustrList;

    // pStr must be checked here: once Begin is added, a NULL pointer no
    // longer looks NULL to KMPNext.
    if( !pStr || Length == 0 || Length > _length )
    {
        return ustrList;
    }

    const wchar_t* pStrBegin = pStr+Begin;
    size_t* pKMPNext = KMPNext(pStrBegin,Length);
    if( !pKMPNext )
    {
        return ustrList;
    }

    try
    {
        Section aSection;
        size_t strPos = 0; // number of pattern characters matched so far
        for( size_t i = 0; i < _length; ++i )
        {
            while( strPos > 0 && pStrBegin[strPos] != _pUStr[i] )
            {
                strPos = pKMPNext[strPos-1];
            }
            if( pStrBegin[strPos] == _pUStr[i] )
            {
                ++strPos;
            }
            if( strPos == Length )
            {
                aSection._begin = i + 1 - Length;
                ustrList.push_back(aSection);
                strPos = 0; // restart from scratch: matches must not overlap
            }
        }
    }
    catch( ... )
    {
        // push_back may throw; do not leak the table.
        delete[] pKMPNext;
        throw;
    }

    delete[] pKMPNext;
    return ustrList;
}

// Finds all occurrences of Str, overlapping ones included; forwards to the
// pointer overload with the whole content of Str as the pattern.
UStr::SectionList UStr::UString::find_overlap(const UString& Str) const
{
    const wchar_t* pPattern = Str.u_str();
    const size_t patternLength = Str.length();
    return find_overlap(pPattern,0,patternLength);
}

// Finds every occurrence of the pattern pStr[Begin .. Begin+Length),
// including occurrences that overlap each other, using KMP matching.
//
// Returns the start positions (stored in Section::_begin) in increasing
// order. The list is empty when pStr is NULL, Length is 0, the pattern is
// longer than this string, or the KMP table cannot be built.
UStr::SectionList UStr::UString::find_overlap(const wchar_t* pStr,size_t Begin, size_t Length) const
{
    SectionList ustrList;

    // pStr must be checked here: once Begin is added, a NULL pointer no
    // longer looks NULL to KMPNext.
    if( !pStr || Length == 0 || Length > _length )
    {
        return ustrList;
    }

    const wchar_t* pStrBegin = pStr+Begin;
    size_t* pKMPNext = KMPNext(pStrBegin,Length);
    if( !pKMPNext )
    {
        return ustrList;
    }

    try
    {
        Section aSection;
        size_t strPos = 0; // number of pattern characters matched so far
        for( size_t i = 0; i < _length; ++i )
        {
            while( strPos > 0 && pStrBegin[strPos] != _pUStr[i] )
            {
                strPos = pKMPNext[strPos-1];
            }
            if( pStrBegin[strPos] == _pUStr[i] )
            {
                ++strPos;
            }
            if( strPos == Length )
            {
                aSection._begin = i + 1 - Length;
                ustrList.push_back(aSection);
                // Fall back to the longest border of the full pattern so the
                // next (possibly overlapping) match is found without
                // re-scanning text that has already been examined.
                strPos = pKMPNext[Length-1];
            }
        }
    }
    catch( ... )
    {
        // push_back may throw; do not leak the table.
        delete[] pKMPNext;
        throw;
    }

    delete[] pKMPNext;
    return ustrList;
}

// Returns the index of the first occurrence of Str at or after Start, or -1
// when Str cannot fit in the remaining text or is not found.
UStr::Int64 UStr::UString::find_first(const UStr::UString& Str,size_t Start) const
{
    const size_t patternLength = Str.length();
    if( Start+patternLength <= _length )
    {
        return find_first(Str.u_str(),0,patternLength,Start);
    }

    return -1;
}

// Returns the index of the first occurrence of WCh at or after Start, or -1
// when there is none (including when Start is past the end of the string).
UStr::Int64 UStr::UString::find_first(wchar_t WCh, size_t Start) const
{
    size_t pos = Start;
    while( pos < _length )
    {
        if( _pUStr[pos] == WCh )
        {
            return static_cast<Int64>(pos);
        }
        ++pos;
    }

    return -1;
}

// Returns the index of the first occurrence of the pattern
// pStr[Begin .. Begin+Length) at or after Start, using KMP matching.
//
// Returns -1 when pStr is NULL, Length is 0, the pattern cannot fit between
// Start and the end of the string, the KMP table cannot be built, or no
// match exists.
UStr::Int64 UStr::UString::find_first(const wchar_t* pStr,
                                      size_t Begin,
                                      size_t Length,
                                      size_t Start
                                      ) const
{
    // pStr must be checked here: once Begin is added, a NULL pointer no
    // longer looks NULL to KMPNext. The range test avoids Start + Length,
    // which could wrap around.
    if( !pStr || Start > _length || Length > _length - Start )
    {
        return -1;
    }

    const wchar_t* pStrBegin = pStr+Begin;
    size_t* pKMPNext = KMPNext(pStrBegin,Length);
    if( !pKMPNext )
    {
        return -1;
    }

    Int64 result = -1;
    size_t strPos = 0; // number of pattern characters matched so far
    for( size_t i = Start; i < _length; ++i )
    {
        while( strPos > 0 && pStrBegin[strPos] != _pUStr[i] )
        {
            strPos = pKMPNext[strPos-1];
        }
        if( pStrBegin[strPos] == _pUStr[i] )
        {
            ++strPos;
        }
        if( strPos == Length )
        {
            result = static_cast<Int64>(i) - static_cast<Int64>(Length) + 1;
            break;
        }
    }

    delete[] pKMPNext;
    return result;
}

bool UStr::UString::be_first(const UString& Str,size_t Start) const
{
return be_first(Str.u_str(),0,Str.length(),Start);
}

// Tests whether the characters pStr[Begin .. Begin+Length) occur in this
// string exactly at position Start.
//
// Returns false when the candidate does not fit between Start and the end
// of the string, or when pStr is NULL and Length is non-zero. An empty
// candidate (Length == 0) that fits matches trivially.
bool UStr::UString::be_first(const wchar_t* pStr,
                             size_t Begin,
                             size_t Length,
                             size_t Start
                             ) const
{
    // Written without computing Start + Length, which could wrap around.
    if( Start > _length || Length > _length - Start )
    {
        return false;
    }
    if( Length == 0 )
    {
        return true;
    }
    if( !pStr )
    {
        return false;
    }

    for( size_t i = 0; i < Length; ++i )
    {
        if( _pUStr[Start+i] != pStr[Begin+i] )
        {
            return false;
        }
    }

    return true;
}

// Copy assignment; self-assignment is a no-op.
UStr::UString& UStr::UString::operator =(const UString& Str)
{
    if( this == &Str )
    {
        return *this;
    }

    set_ustring(Str);
    return *this;
}

// Assigns from a NUL-terminated multibyte string. Assigning NULL releases
// the current buffer and resets the string to empty.
UStr::UString& UStr::UString::operator =(const char* pStr)
{
    if( !pStr )
    {
        delete[] _pUStr;
        default_construct();
        return *this;
    }

    set_ustring(pStr);
    return *this;
}

// Assigns from a NUL-terminated wide string. Assigning NULL releases the
// current buffer and resets the string to empty; assigning the string's own
// buffer pointer is a no-op.
UStr::UString& UStr::UString::operator =(const wchar_t* pStr)
{
    if( !pStr )
    {
        delete[] _pUStr;
        default_construct();
    }
    else if( pStr != this->_pUStr )
    {
        set_ustring(pStr);
    }

    return *this;
}

// Returns a reference to the character at pos.
//
// When pos is out of range, a reference to a shared static dummy character
// is returned instead. The dummy is reset to L'\0' on every out-of-range
// access, so a value written through an earlier out-of-range reference is
// never handed back by a later read.
wchar_t& UStr::UString::operator [](size_t pos)
{
    static wchar_t wch;
    if( pos >= _length )
    {
        wch = L'\0';
        return wch;
    }

    return _pUStr[pos];
}

void UStr::UString::set_ustring(const UString& Str)
{
_length = Str.length();
if( _capacity < _length )
{
delete[] _pUStr;
_capacity = d_capacity(_length);
_pUStr = new wchar_t[ _capacity+1 ];
}

wcsncpy(_pUStr,Str.u_str(),_length);
_pUStr[_length] = L'/0';
}

// Replaces the content of this string with the wide-character conversion of
// the NUL-terminated multibyte string pStr (must not be NULL; callers check).
//
// The conversion uses the user's default locale, which this function
// installs process-wide via setlocale(LC_ALL, ""). If pStr contains an
// invalid multibyte sequence the string becomes empty.
void UStr::UString::set_ustring(const char* pStr)
{
    const size_t len = strlen(pStr);
    if( _capacity < len )
    {
        // Allocate first so a failed allocation leaves the string intact.
        const size_t newCapacity = d_capacity(len);
        wchar_t* pBuf = new wchar_t[newCapacity+1];
        delete[] _pUStr;
        _pUStr = pBuf;
        _capacity = newCapacity;
    }

    (void)setlocale(LC_ALL,"");

    // mbstowcs yields at most one wide character per input byte, so len
    // output slots always suffice. It reports failure as (size_t)-1, which
    // must never be used as an index.
    const size_t converted = mbstowcs(_pUStr,pStr,len);
    _length = ( converted == static_cast<size_t>(-1) ) ? 0 : converted;
    _pUStr[ _length ] = L'\0';
}

void UStr::UString::set_ustring(const wchar_t* pStr)
{
_length = wcslen(pStr);
if( _capacity < _length )
{
delete[] _pUStr;
_capacity = d_capacity(_length);
_pUStr = new wchar_t[_capacity+1];
}

wcsncpy( _pUStr,pStr,_length );
_pUStr[_length] = L'/0';
}

void UStr::UString::default_construct(size_t DefaultSize/* = 0*/)
{
_capacity = d_capacity(DefaultSize);
if( _pUStr = new wchar_t[_capacity+1] )
{
_pUStr[0] = L'/0';
}
else
{
_capacity = 0;
}

_length = 0;
}

// Maps a requested character count to a capacity according to the storage
// type: buffer strings round up to the next multiple of DEFAULT_CAPACITY
// (always at least one full unit), normal strings get Size + 1.
size_t UStr::UString::d_capacity(size_t Size)
{
    if( _usType != eUSNormal ) // i.e. eUSBuffer
    {
        return (Size/DEFAULT_CAPACITY + 1)*DEFAULT_CAPACITY;
    }

    return Size+1;
}

/*
* UString友元比较函数
**************************************************************************************/

// Two strings are equal when they have the same length and wcscmp reports
// no difference between their buffers.
bool UStr::operator == ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    if( Str1.length() != Str2.length() )
    {
        return false;
    }

    return wcscmp(Str1.u_str(),Str2.u_str()) == 0;
}

// Orders by length first: a shorter string is always "less". Strings of
// equal length are ordered by wcscmp.
bool UStr::operator < ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    const size_t lhsLength = Str1.length();
    const size_t rhsLength = Str2.length();
    if( lhsLength != rhsLength )
    {
        return lhsLength < rhsLength;
    }

    return wcscmp(Str1.u_str(),Str2.u_str()) < 0;
}

// Orders by length first: a shorter string is always "less". Strings of
// equal length are ordered by wcscmp.
bool UStr::operator <= ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    const size_t lhsLength = Str1.length();
    const size_t rhsLength = Str2.length();
    if( lhsLength != rhsLength )
    {
        return lhsLength < rhsLength;
    }

    return wcscmp(Str1.u_str(),Str2.u_str()) <= 0;
}

// Orders by length first: a longer string is always "greater". Strings of
// equal length are ordered by wcscmp.
bool UStr::operator > ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    const size_t lhsLength = Str1.length();
    const size_t rhsLength = Str2.length();
    if( lhsLength != rhsLength )
    {
        return lhsLength > rhsLength;
    }

    return wcscmp(Str1.u_str(),Str2.u_str()) > 0;
}

// Orders by length first: a longer string is always "greater". Strings of
// equal length are ordered by wcscmp.
bool UStr::operator >= ( const UStr::UString& Str1, const UStr::UString& Str2 )
{
    const size_t lhsLength = Str1.length();
    const size_t rhsLength = Str2.length();
    if( lhsLength != rhsLength )
    {
        return lhsLength > rhsLength;
    }

    return wcscmp(Str1.u_str(),Str2.u_str()) >= 0;
}


/*
* UStr名空间内的全局函数的实现
************************************************************************************/
// Converts the first StrLen bytes of the multibyte string pStr into wide
// characters stored in pUStr.
//
// pStr    : source bytes; need not be NUL-terminated.
// StrLen  : number of source bytes to convert.
// pUStr   : destination buffer of UStrLen wide characters.
// UStrLen : destination size; must be >= StrLen.
//
// Returns the number of wide characters written (excluding any terminator),
// or 0 when an argument is invalid or the input contains an invalid
// multibyte sequence. A terminating L'\0' is written only when the
// destination has room for it. The conversion uses the user's default
// locale, which this function installs process-wide via
// setlocale(LC_ALL, "").
size_t UStr::S_TO_US(char* pStr,size_t StrLen,wchar_t* pUStr,size_t UStrLen)
{
    if( !pStr || !pUStr || UStrLen < StrLen )
    {
        return 0;
    }

    // mbstowcs needs a NUL-terminated source, so work on a terminated copy.
    char* pStr2 = new char[StrLen+1];
    strncpy(pStr2,pStr,StrLen);
    pStr2[StrLen] = '\0';

    (void)setlocale(LC_ALL,"");

    size_t len = mbstowcs(pUStr,pStr2,StrLen);
    delete[] pStr2;

    // Failure is reported as (size_t)-1, which must never be used as an index.
    if( len == static_cast<size_t>(-1) )
    {
        len = 0;
    }
    // When the output is completely filled there is no slot left for the
    // terminator; writing it would overrun the caller's buffer.
    if( len < UStrLen )
    {
        pUStr[ len ] = L'\0';
    }

    return len;
}

// Converts the first UStrLen wide characters of pUStr into a multibyte
// string stored in pStr.
//
// pUStr   : source wide characters; need not be NUL-terminated.
// UStrLen : number of source characters to convert.
// pStr    : destination buffer of StrLen bytes.
// StrLen  : destination size; must be >= 2 * UStrLen.
//
// Returns the number of bytes written (excluding any terminator), or 0 when
// an argument is invalid or a character cannot be represented in the
// current locale. Output is truncated to StrLen bytes; a terminating '\0' is
// written only when the destination has room for it. The conversion uses the
// user's default locale, which this function installs process-wide via
// setlocale(LC_ALL, "").
size_t UStr::US_TO_S(wchar_t* pUStr,size_t UStrLen,char* pStr,size_t StrLen)
{
    if( !pStr || !pUStr || StrLen < 2*UStrLen )
    {
        return 0;
    }

    // wcstombs needs a NUL-terminated source, so work on a terminated copy.
    wchar_t* pUStr2 = new wchar_t[UStrLen+1];
    wcsncpy(pUStr2,pUStr,UStrLen);
    pUStr2[UStrLen] = L'\0';

    (void)setlocale(LC_ALL,"");

    // Bound the output by the real destination size: sizeof(wchar_t) bytes
    // per character can exceed the 2-bytes-per-character the caller is
    // required to provide.
    size_t len = wcstombs(pStr,pUStr2,StrLen);
    delete[] pUStr2;

    // Failure is reported as (size_t)-1, which must never be used as an index.
    if( len == static_cast<size_t>(-1) )
    {
        len = 0;
    }
    if( len < StrLen )
    {
        pStr[ len ] = '\0';
    }

    return len;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: