您的位置:首页 > 编程语言 > Java开发

余弦相似度-java代码实现

2015-08-01 21:12 876 查看
package sim;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

/**
 * Character-level cosine similarity between two strings.
 *
 * <p>Each string is turned into a vector of per-character occurrence counts
 * (one dimension per distinct {@code char} seen in either string); the
 * similarity is the cosine of the angle between the two vectors.
 *
 * <p>Counting is done per UTF-16 {@code char} (via {@link String#toCharArray()}),
 * so a supplementary code point contributes two dimensions, one per surrogate.
 *
 * Created by panther on 15-7-20.
 */
public class Similarity {
    /**
     * Maps every character seen in either input to a two-element array:
     * index 0 is its count in the first string, index 1 its count in the second.
     */
    Map<Character, int[]> vectorMap = new HashMap<Character, int[]>();

    /**
     * Builds the character-count vectors for the two strings.
     *
     * @param string1 first string, must not be {@code null}
     * @param string2 second string, must not be {@code null}
     * @throws NullPointerException if either argument is {@code null}
     */
    public Similarity(String string1, String string2) {
        if (string1 == null || string2 == null) {
            throw new NullPointerException("string1 and string2 must not be null");
        }
        countCharacters(string1, 0);
        countCharacters(string2, 1);
    }

    // Adds the occurrences of every char in text to the given slot (0 or 1) of vectorMap.
    private void countCharacters(String text, int slot) {
        for (char ch : text.toCharArray()) {
            int[] counts = vectorMap.get(ch);
            if (counts == null) {
                // new int[2] is zero-initialised, so the other string's count starts at 0.
                counts = new int[2];
                vectorMap.put(ch, counts);
            }
            counts[slot]++;
        }
    }

    /**
     * Computes the cosine similarity of the two strings.
     *
     * @return the dot product of the two count vectors divided by the product
     *         of their Euclidean norms; {@code 0} when either string is empty,
     *         because the cosine is undefined for a zero vector and the
     *         division would otherwise yield {@code NaN}
     */
    public double sim() {
        double normProduct = sqrtMulti(vectorMap);
        if (normProduct == 0) {
            return 0;
        }
        return pointMulti(vectorMap) / normProduct;
    }

    // Product of the two vectors' Euclidean norms: sqrt(sum(a^2) * sum(b^2)).
    private double sqrtMulti(Map<Character, int[]> paramMap) {
        return Math.sqrt(squares(paramMap));
    }

    // Sum of squares of each vector, multiplied together.
    private double squares(Map<Character, int[]> paramMap) {
        double sumSquares1 = 0;
        double sumSquares2 = 0;
        for (int[] counts : paramMap.values()) {
            // Widen before multiplying: int * int overflows once a count exceeds 46340.
            sumSquares1 += (double) counts[0] * counts[0];
            sumSquares2 += (double) counts[1] * counts[1];
        }
        return sumSquares1 * sumSquares2;
    }

    // Dot product of the two count vectors.
    private double pointMulti(Map<Character, int[]> paramMap) {
        double dot = 0;
        for (int[] counts : paramMap.values()) {
            // Widen before multiplying to avoid int overflow on large counts.
            dot += (double) counts[0] * counts[1];
        }
        return dot;
    }

    /** Demo: prints the similarity of two sample strings. */
    public static void main(String[] args) {
        String s1 = "我是一个帅哥";
        String s2 = "帅哥是我";
        Similarity similarity = new Similarity(s1, s2);
        System.out.println(similarity.sim());
    }

}


输出结果:约 0.8165



分析:

字符串s1中的内容是“我是一个帅哥”,这个字符串中对应的向量名称为<我,是,一,个,帅,哥>,这个字符串的值为<1,1,1,1,1,1>,字符串s2对应的值为<1,1,0,0,1,1>,向量s1点乘向量s2的结果为1*1+1*1+1*0+1*0+1*1+1*1 = 4,

向量s1的模为根号6,向量s2的模为根号4即2,所以相似度的结果为 4/(√6×2) = 4/√24 ≈ 0.82
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: