import math
from collections import Counter
def build_vector(iterable1, iterable2):
counter1 = Counter(iterable1)
counter2 = Counter(iterable2)
all_items = set(counter1.keys()).union(set(counter2.keys()))
vector1 = [counter1[k] for k in all_items]
vector2 = [counter2[k] for k in all_items]
return vector1, vector2
def cosim(v1, v2):
dot_product = sum(n1 * n2 for n1, n2 in zip(v1, v2) )
magnitude1 = math.sqrt(sum(n ** 2 for n in v1))
magnitude2 = math.sqrt(sum(n ** 2 for n in v2))
return dot_product / (magnitude1 * magnitude2)
l1 = "Julie loves me more than Linda loves me".split()
l2 = "Jane likes me more than Julie loves me or".split()
v1, v2 = build_vector(l1, l2)
print(cosim(v1, v2))
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
/**
*
* @author Xiao Ma
* mail : 409791952@qq.com
*
*/
public class SimilarityUtil {
public static double consineTextSimilarity(String[] left, String[] right) {
Map<String, Integer> leftWordCountMap = new HashMap<String, Integer>();
Map<String, Integer> rightWordCountMap = new HashMap<String, Integer>();
Set<String> uniqueSet = new HashSet<String>();
Integer temp = null;
for (String leftWord : left) {
temp = leftWordCountMap.get(leftWord);
if (temp == null) {
leftWordCountMap.put(leftWord, 1);
uniqueSet.add(leftWord);
} else {
leftWordCountMap.put(leftWord, temp + 1);
}
}
for (String rightWord : right) {
temp = rightWordCountMap.get(rightWord);
if (temp == null) {
rightWordCountMap.put(rightWord, 1);
uniqueSet.add(rightWord);
} else {
rightWordCountMap.put(rightWord, temp + 1);
}
}
int[] leftVector = new int[uniqueSet.size()];
int[] rightVector = new int[uniqueSet.size()];
int index = 0;
Integer tempCount = 0;
for (String uniqueWord : uniqueSet) {
tempCount = leftWordCountMap.get(uniqueWord);
leftVector[index] = tempCount == null ? 0 : tempCount;
tempCount = rightWordCountMap.get(uniqueWord);
rightVector[index] = tempCount == null ? 0 : tempCount;
index++;
}
return consineVectorSimilarity(leftVector, rightVector);
}
/**
* The resulting similarity ranges from −1 meaning exactly opposite, to 1
* meaning exactly the same, with 0 usually indicating independence, and
* in-between values indicating intermediate similarity or dissimilarity.
*
* For text matching, the attribute vectors A and B are usually the term
* frequency vectors of the documents. The cosine similarity can be seen as
* a method of normalizing document length during comparison.
*
* In the case of information retrieval, the cosine similarity of two
* documents will range from 0 to 1, since the term frequencies (tf-idf
* weights) cannot be negative. The angle between two term frequency vectors
* cannot be greater than 90°.
*
* @param leftVector
* @param rightVector
* @return
*/
private static double consineVectorSimilarity(int[] leftVector,
int[] rightVector) {
if (leftVector.length != rightVector.length)
return 1;
double dotProduct = 0;
double leftNorm = 0;
double rightNorm = 0;
for (int i = 0; i < leftVector.length; i++) {
dotProduct += leftVector[i] * rightVector[i];
leftNorm += leftVector[i] * leftVector[i];
rightNorm += rightVector[i] * rightVector[i];
}
double result = dotProduct
/ (Math.sqrt(leftNorm) * Math.sqrt(rightNorm));
return result;
}
public static void main(String[] args) {
String left[] = { "Julie", "loves", "me", "more", "than", "Linda",
"loves", "me" };
String right[] = { "Jane", "likes", "me", "more", "than", "Julie",
"loves", "me" };
System.out.println(consineTextSimilarity(left,right));
}
}
import math
def dot_prod(v1, v2):
ret = 0
for i in range(len(v1)):
ret += v1[i] * v2[i]
return ret
def magnitude(v):
ret = 0
for i in v:
ret += i**2
return math.sqrt(ret)
def cos_sim(v1, v2):
return (dot_prod(v1, v2)) / (magnitude(v1) * magnitude(v2))