您的位置:首页 > 数据库

SQL2005CLR函数扩展-山寨索引

2009-05-18 19:16 267 查看
本文只是一个山寨试验品,思路仅供参考.





对于文件索引lucene才是权威,这里只是自己实现了一个可以实现简单文件索引的半成品.所谓文件索引就是把sql字符串按字节分词保存到磁盘文件目录结构中用来快速定位.





原理介绍:



索引建立



目录结构划分方案也只是很简易的实现了一下,通过unicode把任意连续的两个字符(中文或英文)分为4个字节来做四层目录,把索引的内容对应的主关键字(主要为了使用sql索引和唯一性)作为文件名,两个字符在索引内容中的位置作为文件后缀来存储.文件本身为0字节,不保存任何信息.



比如一条数据 "pk001","山寨索引"

山寨索引

四个字的unicode(UTF-16LE)字节依次为

[0]: 113

[1]: 92

[2]: 232

[3]: 91

[4]: 34

[5]: 125

[6]: 21

[7]: 95

那么对应的文件结构为



../113/92/232/91/pk001.0

../232/91/34/125/pk001.1

../34/125/21/95/pk001.2



索引使用



比如搜索"寨索引",则先搜索"../232/91/34/125/"目录下的所有文件,然后根据 pk001.1 的文件后缀名1,去看 ../34/125/21/95/pk001.2 文件是否存在.依次类推,最后返回一个结果集.







实用性



具体的实用性还有待验证.这只是实现了精确的like搜索,而不能做常见搜索引擎的分词效果.另外海量数据重建索引的性能也是面临很严峻的问题,比如cpu负载和磁盘io负载.关于windows一个目录下可以保存多少个文件而不会对文件搜索造成大的性能损失也有待评估,不过这个可以考虑根据主键的文件名hash来增加文件目录深度,降低单一目录下的文件数量.







演示效果

实现了针对test表的name和caption两个字段作索引搜索.



-- Set and read back the root directory of the index files (uncomment to use).
--select dbo.xfn_SetMyIndexFileRoot('d:/MyIndex')
--select dbo.xfn_GetMyIndexFileRoot()

-- Build the test environment.
go

create table test
(
    id      uniqueidentifier,
    name    nvarchar(100),
    caption nvarchar(100)
)

-- Unicode literals carry the N prefix so the Chinese text is not pushed through
-- the database code page (which can turn it into '?' on non-Chinese collations).
insert into test select top 3 newid(), N'我的索引', N'测试'     from sysobjects
insert into test select top 3 newid(), N'我的测试', N'索引'     from sysobjects
insert into test select top 3 newid(), N'测试索引', N'测试索引' from sysobjects
insert into test select top 3 newid(), N'我的索引', N'索引'     from sysobjects

create index i_testid on test (id)

-- Build the index files: one call per row, indexing "name caption".
declare @t int

select @t = dbo.xfn_SetKeyForMyIndex(id, 'testIndex', name + N' ' + caption)
from test

-- Query: rows whose indexed text contains every one of the space-separated terms.
select a.*
from test a
inner join dbo.xfn_GetKeyFromMyIndex(N'测试 索引 我的', 'testIndex') b
    on a.id = b.pk

/*
Sample result (id, name, caption):

0C4634EA-DF94-419A-A8E5-793BD5F54EED    我的索引    测试
2DD87B38-CD3F-4F14-BB4A-00678463898F    我的索引    测试
8C67A6C3-753F-474C-97BA-CE85A2455E3E    我的索引    测试
C9706BF1-FB1F-42FB-8A48-69EC37EAD3E5    我的测试    索引
8BBF25CC-9DBB-4FCB-B2EB-D318E587DD5F    我的测试    索引
8B45322D-8E46-4691-961A-CD0078F1FA0A    我的测试    索引
*/

--drop table test







clr代码如下:编译为MyFullIndex.dll



using

System;

using

System.Data.SqlTypes;

using

Microsoft.SqlServer.Server;

using

System.Collections;

using

System.Collections.Generic;

public

partial

class

UserDefinedFunctions

{


/// <summary>
/// Sets the root directory under which the index files are stored.
/// </summary>
/// <param name="value">Path of the directory to use as the new root.</param>
/// <returns>
/// True when <paramref name="value"/> is not NULL and names an existing directory
/// (the root is then replaced); false otherwise (the root is left unchanged).
/// </returns>
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlBoolean SetRoot(SqlString value)
{
    // Short-circuit keeps value.Value from being read on a NULL argument.
    bool accepted = !value.IsNull && System.IO.Directory.Exists(value.Value);

    if (accepted)
    {
        root = value.Value;
    }

    return accepted;
}


/// <summary>
/// Returns the directory currently used as the root of the index files.
/// </summary>
/// <returns>The current value of the static <c>root</c> field.</returns>
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlString GetRoot()
{
    SqlString current = new SqlString(root);
    return current;
}


/// <summary>
/// SQL entry point that indexes one piece of content under a primary key.
/// </summary>
/// <param name="key">Primary key of the row being indexed.</param>
/// <param name="indexName">Name of the index to write into.</param>
/// <param name="content">Text to index.</param>
/// <returns>
/// 0 when any argument is NULL; otherwise whatever <c>_setIndex</c> returns
/// for the unwrapped values.
/// </returns>
[Microsoft.SqlServer.Server.SqlFunction]
public static SqlInt32 SetIndex(SqlString key, SqlString indexName, SqlString content)
{
    bool anyNull = key.IsNull || content.IsNull || indexName.IsNull;
    if (anyNull)
    {
        return 0;
    }

    int indexed = _setIndex(key.Value, indexName.Value, content.Value);
    return indexed;
}




/// <summary>
/// SQL table-valued entry point that looks up keys in an index.
/// </summary>
/// <param name="word">Search terms, separated by spaces.</param>
/// <param name="indexName">Name of the index to search.</param>
/// <returns>
/// A sequence of key strings, one per result row (converted by <c>FillRow</c>).
/// Empty when either argument is NULL; otherwise the result of <c>_getIndex2</c>.
/// </returns>
[SqlFunction(TableDefinition = "pk nvarchar(900)", Name = "GetIndex", FillRowMethodName = "FillRow")]
public static IEnumerable GetIndex(SqlString word, SqlString indexName)
{
    if (!word.IsNull && !indexName.IsNull)
    {
        return _getIndex2(word.Value, indexName.Value);
    }

    // A NULL argument yields an empty result set rather than an error.
    return new List<string>();
}




/// <summary>
/// Fill-row callback for <c>GetIndex</c>: turns one enumerated item into the
/// single <c>pk</c> column of a result row.
/// </summary>
/// <param name="obj">One item from the sequence returned by <c>GetIndex</c>.</param>
/// <param name="pk">Receives the text form of <paramref name="obj"/>.</param>
public static void FillRow(Object obj, out SqlString pk)
{
    pk = new SqlString(obj.ToString());
}






// Root directory used as the first path segment of every index directory and file
// built by _getIndex and _setIndex; SetRoot replaces it and GetRoot returns it.
static

string

root = @"d:/index"

;






/// <summary>
/// Looks up the keys that match every one of a list of space-separated terms.
/// </summary>
/// <param name="word">Search terms separated by spaces; empty entries are ignored.</param>
/// <param name="indexName">Name of the index to search.</param>
/// <returns>
/// The keys returned by <c>_getIndex</c> for the first term, reduced to those that
/// <c>_getIndex</c> also returns for every further term. Empty when
/// <paramref name="word"/> contains no terms.
/// </returns>
static System.Collections.Generic.List<string> _getIndex2(string word, string indexName)
{
    string[] arrWord = word.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Empty or all-space input has no first term; return no matches instead of
    // indexing arrWord[0] and throwing.
    if (arrWord.Length == 0)
    {
        return new List<string>();
    }

    List<string> matches = _getIndex(arrWord[0], indexName);

    // Intersect with each further term. Stop early once nothing is left:
    // an empty intersection cannot grow again.
    for (int t = 1; t < arrWord.Length && matches.Count > 0; t++)
    {
        List<string> termKeys = _getIndex(arrWord[t], indexName);

        // Dictionary used as a set (HashSet<T> is not available on .NET 2.0)
        // so each membership test is a hash lookup instead of a list scan.
        Dictionary<string, bool> termSet = new Dictionary<string, bool>(termKeys.Count);
        foreach (string termKey in termKeys)
        {
            termSet[termKey] = true;
        }

        // Walk backwards so RemoveAt does not shift entries still to be visited.
        // Each entry is tested exactly once per term, so a removed entry is never
        // re-read at the same index.
        for (int i = matches.Count - 1; i >= 0; i--)
        {
            if (!termSet.ContainsKey(matches[i]))
            {
                matches.RemoveAt(i);
            }
        }
    }

    return matches;
}


/// <summary>
/// Looks up the keys whose indexed content contains a single term.
/// </summary>
/// <param name="word">The term to find; it needs at least 4 bytes in UTF-16 (two characters) to match anything.</param>
/// <param name="indexName">Name of the index to search.</param>
/// <returns>
/// The distinct keys for which an index file exists for every consecutive
/// character pair of <paramref name="word"/> at consecutive positions.
/// Empty when the term is too short or the first pair's directory does not exist.
/// </returns>
static System.Collections.Generic.List<string> _getIndex(string word, string indexName)
{
    List<string> ret = new List<string>();
    byte[] bWord = System.Text.Encoding.Unicode.GetBytes(word);

    // Directories are keyed by four bytes, so a shorter term cannot be looked up.
    if (bWord.Length < 4)
    {
        return ret;
    }

    // Directory of the first character pair: root/index/b0/b1/b2/b3/
    string path = string.Format(@"{0}/{1}/{2}/{3}/{4}/{5}/",
        root, indexName, bWord[0], bWord[1], bWord[2], bWord[3]);

    if (!System.IO.Directory.Exists(path))
    {
        return ret;
    }

    // Keys already accepted; a Dictionary serves as a set (no HashSet<T> on .NET 2.0).
    Dictionary<string, bool> accepted = new Dictionary<string, bool>();

    foreach (string file in System.IO.Directory.GetFiles(path))
    {
        // File name is "<key>.<position>".
        string key = System.IO.Path.GetFileNameWithoutExtension(file);

        // The same key can occur at several positions; once it matched there is
        // no need to probe the file system for it again.
        if (accepted.ContainsKey(key))
        {
            continue;
        }

        string index = System.IO.Path.GetExtension(file).TrimStart(new char[] { '.' });

        // Skip files that are not index entries (no numeric position suffix)
        // instead of letting one stray file abort the whole search.
        int cIndex;
        if (!int.TryParse(index, out cIndex))
        {
            continue;
        }

        // Every following pair must exist for the same key at the next position.
        bool bHas = true;
        for (int i = 2; i < bWord.Length - 3; i = i + 2)
        {
            cIndex++;
            string nextFile = string.Format(@"{0}/{1}/{2}/{3}/{4}/{5}/{6}.{7}",
                root, indexName, bWord[i + 0], bWord[i + 1], bWord[i + 2], bWord[i + 3], key, cIndex);

            if (!System.IO.File.Exists(nextFile))
            {
                bHas = false;
                break;
            }
        }

        if (bHas)
        {
            accepted[key] = true;
            ret.Add(key);
        }
    }

    return ret;
}




/// <summary>
/// Writes the index files for one piece of content.
/// </summary>
/// <remarks>
/// For every pair of consecutive characters, the four UTF-16 bytes of the pair name
/// four nested directories below root/indexName, and an empty file named
/// "<c>key</c>.<c>position</c>" is created there (position = zero-based character
/// offset of the pair).
/// </remarks>
/// <param name="key">Primary key; used as the file name.</param>
/// <param name="indexName">Name of the index; used as the directory below the root.</param>
/// <param name="content">Text to index.</param>
/// <returns>
/// 0 when <paramref name="content"/> is shorter than one character pair;
/// otherwise the length of <paramref name="content"/> in characters.
/// </returns>
static int _setIndex(string key, string indexName, string content)
{
    byte[] bContent = System.Text.Encoding.Unicode.GetBytes(content);

    // One character pair is 4 bytes. Content of exactly two characters must still
    // be indexed, because _getIndex accepts two-character search terms.
    if (bContent.Length < 4)
    {
        return 0;
    }

    for (int i = 0; i < bContent.Length - 3; i = i + 2)
    {
        string path = string.Format(@"{0}/{1}/{2}/{3}/{4}/{5}/",
            root, indexName, bContent[i + 0], bContent[i + 1], bContent[i + 2], bContent[i + 3]);

        // CreateDirectory is a no-op when the directory already exists.
        System.IO.Directory.CreateDirectory(path);

        // i advances two bytes per character, so i / 2 is the character position.
        string file = string.Format(@"{0}/{1}.{2}", path, key, i / 2);

        if (!System.IO.File.Exists(file))
        {
            // The file carries no data; only its existence matters.
            using (System.IO.File.Create(file))
            {
            }
        }
    }

    return content.Length;
}

};



部署的sql脚本如下

-- Uncomment to remove a previous deployment (functions first, then the assembly).
--DROP FUNCTION dbo.xfn_SetMyIndexFileRoot
--DROP FUNCTION dbo.xfn_GetMyIndexFileRoot
--DROP FUNCTION dbo.xfn_GetKeyFromMyIndex
--DROP FUNCTION dbo.xfn_SetKeyForMyIndex
--DROP ASSEMBLY MyFullIndex
--GO

-- Register the compiled CLR assembly.
CREATE ASSEMBLY MyFullIndex
FROM 'd:/SQLCLR/MyFullIndex.dll'
WITH PERMISSION_SET = UNSAFE;
GO

-- Index search: returns the primary keys whose indexed text contains every
-- space-separated term in @word.
-- pk is declared nvarchar(900) to match the CLR TableDefinition ("pk nvarchar(900)")
-- and the @pk parameter of dbo.xfn_SetKeyForMyIndex, so every key that can be
-- indexed can also be returned.
CREATE FUNCTION dbo.xfn_GetKeyFromMyIndex
(
    @word      nvarchar(max),
    @indexName nvarchar(900)
)
RETURNS TABLE
(
    pk nvarchar(900)
)
AS EXTERNAL NAME MyFullIndex.UserDefinedFunctions.GetIndex
GO

-- Index creation: writes the index entries of @word for primary key @pk
-- into the index named @indexName.
CREATE FUNCTION dbo.xfn_SetKeyForMyIndex
(
    @pk        nvarchar(900),
    @indexName nvarchar(900),
    @word      nvarchar(max)
)
RETURNS int
AS EXTERNAL NAME MyFullIndex.UserDefinedFunctions.SetIndex
GO

-- Returns the root directory currently used for the index files.
CREATE FUNCTION dbo.xfn_GetMyIndexFileRoot ()
RETURNS nvarchar(max)
AS EXTERNAL NAME MyFullIndex.UserDefinedFunctions.GetRoot
GO

-- Sets the root directory for the index files
-- (the default compiled into the assembly is d:/index).
CREATE FUNCTION dbo.xfn_SetMyIndexFileRoot
(
    @FileRoot nvarchar(max)
)
RETURNS bit
AS EXTERNAL NAME MyFullIndex.UserDefinedFunctions.SetRoot
GO
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: