您的位置:首页 > 其它

看看str系列函数的实现 一

2007-12-26 16:14 323 查看
很多面试题都会提到自己来实现一个函数,strcmp,strcpy,strstr……
我们来看一下以下函数的实现:

/* Compare the NUL-terminated strings P1 and P2.

   Returns zero when the strings are equal, a negative value when P1
   sorts before P2 and a positive value when P1 sorts after P2.  The
   result is the difference between the first pair of bytes that differ,
   with each byte interpreted as unsigned char.  */
int
strcmp (const char *p1, const char *p2)
{
  /* Walk the strings through unsigned char pointers so that bytes with
     the high bit set compare as values 128..255 rather than as negative
     numbers on platforms where plain char is signed.  */
  const unsigned char *s1 = (const unsigned char *) p1;
  const unsigned char *s2 = (const unsigned char *) p2;
  unsigned char c1, c2;

  do
    {
      c1 = *s1++;
      c2 = *s2++;
      /* End of P1: stop here so that neither string is read past its
         terminator.  c1 - c2 is 0 when P2 ends too, negative otherwise.  */
      if (c1 == '\0')
        return c1 - c2;
    }
  while (c1 == c2);

  /* First mismatch; both operands promote to int, so this cannot wrap.  */
  return c1 - c2;
}
早上偶想了一上午看了这个代码才知道差距所在,do while 语句莫有想到 c1-c2 也莫有想到.

下面来看个稍微复杂点的,strstr,偶们公司面试经常出这道题,不少所谓的英雄好汉折戟沉沙于此,其实不追求效率的话,
两个for循环也可以搞定,追求效率就要看这个了.

/* Type used to hold one character value while scanning (a plain unsigned int). */
typedef unsigned chartype;

/* Find the first occurrence of the string PNEEDLE in the string PHAYSTACK.

   Returns a pointer to the start of the match inside PHAYSTACK, PHAYSTACK
   itself when PNEEDLE is the empty string, or 0 when there is no match.

   In the comments below, b is the first character of the needle and c is
   the second one.  */
char *
strstr (phaystack, pneedle)
const char *phaystack;
const char *pneedle;
{
const unsigned char *haystack, *needle;
chartype b;
const unsigned char *rneedle;

haystack = (const unsigned char *) phaystack;

/* b = first needle character.  If it is 0 the needle is empty: the whole
   search is skipped and control falls through to foundneedle.  */
if ((b = *(needle = (const unsigned char *) pneedle)))
{
chartype c;
haystack--; /* possible ANSI violation */
// Step back one position (to just before the string) so that the
// pre-increment in *++haystack below reads the first character first.
{
chartype a;
do
if (!(a = *++haystack)) // hit the end of the haystack before seeing b
goto ret0;
while (a != b); // stop on the first haystack character equal to b
}

if (!(c = *++needle)) // c = second needle character
goto foundneedle; // one-character needle: it was just found at haystack
++needle; // needle now points at the third needle character; c keeps the second
goto jin;

for (;;)
{
{
chartype a; // each brace block below declares its own a
if (0) // never true: the jin block is entered only through "goto jin"
jin:{
if ((a = *++haystack) == c) // character after the b match equals c
goto crest;
}
else
a = *++haystack; // normal loop entry: advance to the next haystack character
do
{
for (; a != b; a = *++haystack) // look for b, examining two characters per iteration
{
if (!a)
goto ret0;
if ((a = *++haystack) == b) // found b at the current position
break;
if (!a)
goto ret0;
}
}
while ((a = *++haystack) != c); // leave once the character after a b match equals c
}
crest: // b and c have matched; haystack points at the character that matched c
{
chartype a;
{
const unsigned char *rhaystack;
// rhaystack = character after the c match; haystack steps back onto the b match
// (the candidate start); rneedle remembers the third needle character's position.
if (*(rhaystack = haystack-- + 1) == (a = *(rneedle = needle)))
do
{
if (!a) // matched up to and including the needle's terminator
goto foundneedle;
if (*++rhaystack != (a = *++needle)) // next pair differs: leave the do-while
break;
if (!a) // matched up to and including the needle's terminator
goto foundneedle;
}
while (*++rhaystack == (a = *++needle)); // keep going while the pairs are equal
needle = rneedle; /* took the register-poor approach */
// Restore needle to the third needle character for the next candidate.
}
if (!a) // the needle ended during the comparison, so every needle character matched;
break; // leaving for(;;) falls through to foundneedle
}
}
}
foundneedle:
return (char *) haystack;
ret0:
return 0;
}

看完这段代码,莫有惊喜,从作者自信满满的口气中(Until someone tells me otherwise, I assume that this is the
fastest implementation of strstr() in C),我还以为至少效率会超过kmp的,看完以后,这只不过是个精巧的回溯算法罢了,
多次用到的goto 也莫有什么特色,不过这个人很有意思的一点是不写注释也就罢了,还要故意说出来(I deliberately chose not to comment it. You should have at least
as much fun trying to understand it, as I had to write it :-).) 鄙视你

附另一个人写的,貌似加州大学的一个教授应该是solaris上的实现

/* Locate the first occurrence of SUBSTRING inside STRING.

   Returns a pointer to the first character of the match within STRING,
   STRING itself when SUBSTRING is empty, or a null pointer when
   SUBSTRING does not occur.  Neither argument is modified; the result
   is non-const only to follow the standard strstr signature.  */
char *
strstr (const char *string, const char *substring)
{
  const char *a, *b;

  /* An empty substring matches at the very start.  */
  if (*substring == '\0')
    return (char *) string;

  /* Scan for a position whose first character matches, then compare
     the rest of the substring from there.  */
  for (; *string != '\0'; string++)
    {
      if (*string != *substring)
        continue;

      a = string;
      b = substring;
      for (;;)
        {
          /* Reached the end of the substring: everything matched.  */
          if (*b == '\0')
            return (char *) string;
          /* Mismatch, which includes running off the end of STRING.  */
          if (*a++ != *b++)
            break;
        }
    }

  return (char *) 0;
}

这个算法看上去方方正正,冲正平和,让人一看上去就觉得程序就应该是这么写的,而你自己放手去写却写不出这个味道来,所谓大巧
不工,莫过于此.
同是回溯算法,我真的不认为第一个除了神叨叨的故弄玄虚了半天,效率上有多少提升.

再来看看strlen,偶自信满满的写了一个先:

/* Return the number of characters in STR before its terminating NUL.

   The cursor has the same pointer type as STR, so the final pointer
   subtraction is well-formed.  */
size_t
strlen (const char *str)
{
  const char *s = str;

  while (*s != '\0')
    s++;

  /* s now points at the terminator; the distance from STR is the length.  */
  return (size_t) (s - str);
}

自我感觉相当良好,再看看人家写的,完全不懂! 我靠 偶整整落后一百年!

/* Return the length of the null-terminated string STR.  Scan for
   the null terminator quickly by testing one longword (four or eight
   bytes) at a time.

   Note that the word loop reads whole aligned longwords, so it may read
   bytes located after the terminator within the same longword.  */
size_t
strlen (const char *str)
{
  const char *char_ptr;
  const unsigned long int *longword_ptr;
  unsigned long int longword, himagic, lomagic;
  size_t i;

  /* Handle the first few characters by reading one character at a time.
     Do this until CHAR_PTR is aligned on a longword boundary.  */
  for (char_ptr = str;
       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
       ++char_ptr)
    if (*char_ptr == '\0')
      return char_ptr - str;

  longword_ptr = (const unsigned long int *) char_ptr;

  /* HIMAGIC has the top bit of every byte set, LOMAGIC the bottom bit.
     The constants are written for 4-byte longwords and widened below
     for 8-byte longwords.  */
  himagic = 0x80808080L;
  lomagic = 0x01010101L;
  if (sizeof (longword) > 4)
    {
      /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
      himagic = ((himagic << 16) << 16) | himagic;
      lomagic = ((lomagic << 16) << 16) | lomagic;
    }
  if (sizeof (longword) > 8)
    abort ();

  /* Instead of the traditional loop which tests each character,
     test a longword at a time.  */
  for (;;)
    {
      longword = *longword_ptr++;

      /* Subtracting 1 from every byte turns a zero byte into 0xff, so
         its top bit shows up under HIMAGIC.  The lowest zero byte never
         receives a borrow (all bytes below it are non-zero), so a
         terminator is never missed.  Bytes that already have the top
         bit set can also trigger the test; such misfires are filtered
         out by the byte-by-byte check below.  */
      if (((longword - lomagic) & himagic) != 0)
        {
          /* Which of the bytes was the zero?  If none of them were, it
             was a misfire; continue the search.  */
          const char *cp = (const char *) (longword_ptr - 1);

          for (i = 0; i < sizeof (longword); ++i)
            if (cp[i] == 0)
              return cp - str + i;
        }
    }
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: