您的位置:首页 > 编程语言 > PHP开发

PHP 利用余弦相似度计算文章的相似度

2017-03-24 16:10 399 查看
1.用php完成余弦相似度算法
<?php
// Word segmentation helper (the original note says it is based on SCWS).
// NOTE(review): presumably this file defines fenci(), which is called below — confirm.
require_once './fenci.php';

/**
 * Cosine similarity between two texts, computed on term-frequency vectors.
 *
 * Each text is stripped of punctuation (delSymbol), segmented by fenci() and
 * split on single spaces. The distinct tokens of both texts form a shared
 * vocabulary; each text is then expressed as a vector of occurrence counts
 * over that vocabulary, and similarity() returns the cosine of the two vectors.
 */
class Similarity
{
    /** @var array Shared vocabulary: distinct tokens of both texts (keys may be sparse). */
    public $wordArr = array();

    /** @var array Tokens of the first text, in order, empty tokens removed. */
    public $strArr1 = array();

    /** @var array Tokens of the second text, in order, empty tokens removed. */
    public $strArr2 = array();

    /** @var array Occurrence count of every vocabulary word in the first text, keyed like $wordArr. */
    public $vectorStr1 = array();

    /** @var array Occurrence count of every vocabulary word in the second text, keyed like $wordArr. */
    public $vectorStr2 = array();

    /**
     * Tokenize both texts and build their term-frequency vectors.
     *
     * @param string $str1 First text.
     * @param string $str2 Second text.
     */
    public function __construct($str1, $str2)
    {
        $this->strArr1 = $this->tokenize($str1);
        $this->strArr2 = $this->tokenize($str2);

        // array_unique() compares elements as strings and keeps the key of the
        // first occurrence, so the vocabulary keys are not necessarily 0..n-1.
        $this->wordArr = array_unique(array_merge($this->strArr1, $this->strArr2));

        $this->vectorStr1 = $this->getVectorStr($this->strArr1, $this->wordArr);
        $this->vectorStr2 = $this->getVectorStr($this->strArr2, $this->wordArr);
    }

    /**
     * Compute the cosine similarity of the two texts; the closer to 1, the more similar.
     *
     * @return float Cosine of the two count vectors, or 0.0 when either vector
     *               is all zeros (e.g. a text that produced no tokens), where
     *               the cosine is undefined.
     */
    public function similarity()
    {
        $dot = 0;
        $normSq1 = 0;
        $normSq2 = 0;

        foreach ($this->wordArr as $key => $word) {
            $a = isset($this->vectorStr1[$key]) ? $this->vectorStr1[$key] : 0;
            $b = isset($this->vectorStr2[$key]) ? $this->vectorStr2[$key] : 0;

            $dot += $a * $b;
            $normSq1 += $a * $a;
            $normSq2 += $b * $b;
        }

        // Guard the division: a zero-length vector would otherwise divide by zero.
        if ($normSq1 <= 0 || $normSq2 <= 0) {
            return 0.0;
        }

        return $dot / (sqrt($normSq1) * sqrt($normSq2));
    }

    /**
     * Build the term-frequency vector of a token list over a vocabulary.
     *
     * Tokens are matched by exact string value, so numeric-looking tokens such
     * as '10' and '1e1' are counted as different words.
     *
     * @param array $strArr  Tokens of one text.
     * @param array $wordArr Vocabulary; its keys are reused as the keys of the result.
     * @return array Occurrence count of each vocabulary word in $strArr.
     */
    public function getVectorStr($strArr, $wordArr)
    {
        // One pass to count every token, then one pass over the vocabulary.
        $counts = array();
        foreach ($strArr as $token) {
            $tokenKey = (string) $token;
            $counts[$tokenKey] = isset($counts[$tokenKey]) ? $counts[$tokenKey] + 1 : 1;
        }

        $vectorStr = array();
        foreach ($wordArr as $key => $word) {
            $wordKey = (string) $word;
            $vectorStr[$key] = isset($counts[$wordKey]) ? $counts[$wordKey] : 0;
        }

        return $vectorStr;
    }

    /**
     * Remove superfluous symbols (ASCII and CJK punctuation, spaces) from a
     * string to improve matching accuracy.
     *
     * @param string $str Raw text.
     * @return string Text with every listed symbol removed.
     */
    public function delSymbol($str)
    {
        // NOTE(review): the first entry appears to be an invisible character
        // (possibly a zero-width space) — confirm against the original source.
        $symbolArr = array(
            '​', '“', '”', '"', '>', '<', ' ', ' ', '`', '·', '~', '!', '!', '@', '#', '$', '¥', '%', '^', '……',
            '&', '*', '(', ')', '(', ')', '-', '_', '——', '+', '=', '|', '\\', '[', ']', '【', '】', '{', '}',
            ';', ';', ':', ':', '\'', '"', '“', '”', ',', ',', '<', '>', '《', '》', '.', '。', '/', '、', '?', '?',
        );

        return str_replace($symbolArr, '', $str);
    }

    /**
     * Strip symbols, segment the text with fenci() and split it into tokens.
     *
     * Empty tokens (produced by explode() for empty output or consecutive
     * spaces) are dropped so that '' never becomes a vocabulary word.
     *
     * @param string $str Raw text.
     * @return array List of non-empty tokens.
     */
    private function tokenize($str)
    {
        $tokens = array();
        foreach (explode(" ", fenci($this->delSymbol($str))) as $token) {
            if ($token !== '') {
                $tokens[] = $token;
            }
        }

        return $tokens;
    }
}
<?php
// Demo script: build a Similarity object from two sample strings and dump the score.
error_reporting(E_ALL);

// NOTE(review): presumably the file that defines the Similarity class — confirm.
require_once './simhash2.php';

// The two sample texts share most of their words but in a different order.
$firstText = '“熊猫家源 世界茶源”,雅安主题馆亮相第十六届西博会        	';
$secondText = '“雅安主题馆亮相第十六届西博会熊猫家源·世界茶源”         	';

$comparer = new Similarity($firstText, $secondText);
$score = $comparer->similarity();

var_dump($score);
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  php 算法 相似度 排重