在 Python 中,给定稀疏矩阵数据计算余弦距离的最快方法是什么?

给定一个稀疏矩阵清单,计算矩阵中每个列(或行)之间的余弦距离的最佳方法是什么?我宁愿不迭代 n-select-2次。

假设输入矩阵是:

A=
[0 1 0 0 1
0 0 1 1 1
1 1 0 1 0]

其稀疏表示(非零元素的行、列坐标列表)为:

A =
0, 1
0, 4
1, 2
1, 3
1, 4
2, 0
2, 1
2, 3

在 Python 中,使用矩阵输入格式很简单:

import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine


# 3 rows x 5 columns binary example matrix from the question.
A = np.array(
[[0, 1, 0, 0, 1],
[0, 0, 1, 1, 1],
[1, 1, 0, 1, 0]])


# pairwise_distances returns cosine *distances*; subtracting from 1 converts
# them to cosine *similarities* -- so despite its name, `dist_out` holds the
# pairwise similarity matrix shown below.
dist_out = 1-pairwise_distances(A, metric="cosine")
dist_out

给予:

array([[ 1.        ,  0.40824829,  0.40824829],
[ 0.40824829,  1.        ,  0.33333333],
[ 0.40824829,  0.33333333,  1.        ]])

对于完全矩阵输入来说,这很好,但是我真的想从稀疏表示开始(由于矩阵的大小和稀疏性)。对于如何最好地实现这一目标有什么想法吗?先谢谢你。

163814 次浏览

你应该看看 scipy.sparse。您可以对这些稀疏矩阵应用操作,就像使用普通矩阵一样。

下面的方法比 scipy.spatial.distance.pdist快约30倍。它在大矩阵上运行得非常快(假设您有足够的 RAM)

请参阅下面关于如何优化稀疏性的讨论。

import numpy as np


# Pairwise cosine similarity of the rows of A via a single matrix product
# followed by row/column rescaling.

# Gram matrix: entry (i, j) is the dot product of rows i and j.
# (Replace with A.dot(A.T).toarray() when A is a scipy.sparse matrix.)
similarity = A @ A.T


# The squared L2 norm of each row sits on the diagonal of the Gram matrix.
square_mag = np.diag(similarity)


# Reciprocal of every squared norm; an all-zero row produces inf here...
inv_square_mag = 1.0 / square_mag


# ...which is replaced by 0 so empty rows end up with similarity 0, not inf.
inv_square_mag[np.isinf(inv_square_mag)] = 0


# Inverse norms (1 / |a_i|).
inv_mag = np.sqrt(inv_square_mag)


# Scale columns, then rows, by the inverse norms: S_ij / (|a_i| * |a_j|).
cosine = (similarity * inv_mag).T * inv_mag

如果您的问题是典型的大规模二进制首选项问题,那么您在一个维度中的条目比另一个维度多得多。另外,短维度是要计算其项之间相似性的维度。让我们把这个维度称为“项目”维度。

如果是这种情况,在行中列出您的“项目”,并使用 scipy.sparse创建 A。然后按指示替换第一行。

如果你的问题是非典型的,你将需要更多的修改。这些应该是相当简单的基本 numpy操作与其 scipy.sparse等效操作的替代。

嗨,你可以这样做

    temp = sp.coo_matrix((data, (row, col)), shape=(3, 59))  # build from COO triplets (assumes sp = scipy.sparse; data/row/col defined earlier)
temp1 = temp.tocsr()


#Cosine similarity
# L2 norm of every row: row-wise sum of squared entries, then sqrt.
row_sums = ((temp1.multiply(temp1)).sum(axis=1))
rows_sums_sqrt = np.array(np.sqrt(row_sums))[:,0]
# Divide each stored value by its row's norm, making every row a unit vector.
row_indices, col_indices = temp1.nonzero()
temp1.data /= rows_sums_sqrt[row_indices]
# Normalized matrix times its transpose = pairwise cosine similarity matrix.
temp2 = temp1.transpose()
temp3 = temp1*temp2

你可以直接使用 sklearn 来计算稀疏矩阵的成对余弦距离。在0.17版本中,它还支持稀疏输出:

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse


# NOTE(review): relies on `import numpy as np` from earlier on the page.
A =  np.array([[0, 1, 0, 0, 1], [0, 0, 1, 1, 1],[1, 1, 0, 1, 0]])
A_sparse = sparse.csr_matrix(A)


# cosine_similarity accepts sparse input directly; default output is dense.
similarities = cosine_similarity(A_sparse)
print('pairwise dense output:\n {}\n'.format(similarities))


#also can output sparse matrices
similarities_sparse = cosine_similarity(A_sparse,dense_output=False)
print('pairwise sparse output:\n {}\n'.format(similarities_sparse))

结果:

pairwise dense output:
[[ 1.          0.40824829  0.40824829]
[ 0.40824829  1.          0.33333333]
[ 0.40824829  0.33333333  1.        ]]


pairwise sparse output:
(0, 1)  0.408248290464
(0, 2)  0.408248290464
(0, 0)  1.0
(1, 0)  0.408248290464
(1, 2)  0.333333333333
(1, 1)  1.0
(2, 1)  0.333333333333
(2, 0)  0.408248290464
(2, 2)  1.0

如果你想要列的余弦相似性,只需事先调换你的输入矩阵:

A_sparse.transpose()

我汇总了上面所有的答案,并且:1)验证了每个结果的一致性(见下面的断言);2)比较了哪种方法最快。代码及结果如下:

# Imports
import numpy as np
import scipy.sparse as sp
from scipy.spatial.distance import squareform, pdist
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity


# Create an adjacency matrix
# NOTE(review): entries are 0/1 only -- method 5 in calc_sim relies on that.
np.random.seed(42)
A = np.random.randint(0, 2, (10000, 100)).astype(float).T


# Make it sparse
rows, cols = np.where(A)
data = np.ones(len(rows))
Asp = sp.csr_matrix((data, (rows, cols)), shape = (rows.max()+1, cols.max()+1))


# NOTE(review): Python 2 print statement; needs parentheses on Python 3.
print "Input data shape:", Asp.shape


# Define a function to calculate the cosine similarities a few different ways
def calc_sim(A, method=1):
    """Pairwise cosine similarity of the rows of A, computed several ways.

    Parameters
    ----------
    A : np.ndarray or scipy.sparse matrix
        Data matrix; rows are the items to compare. Methods 1-4 and 6
        expect a dense array; method 5 expects a scipy.sparse matrix.
    method : int
        Which implementation to use (1-6). All return the same values.
        Methods 3 and 6 require scikit-learn.

    Returns
    -------
    np.ndarray of shape (n_rows, n_rows) holding cosine similarities.
    """
    if method == 1:
        # pdist yields condensed cosine *distances*; expand and flip to similarity.
        return 1 - squareform(pdist(A, metric='cosine'))
    if method == 2:
        # Normalize each row to unit length, then a plain dot product.
        Anorm = A / np.linalg.norm(A, axis=-1)[:, np.newaxis]
        return np.dot(Anorm, Anorm.T)
    if method == 3:
        # Same normalization; the dot product is done by sklearn's linear_kernel.
        Anorm = A / np.linalg.norm(A, axis=-1)[:, np.newaxis]
        return linear_kernel(Anorm)
    if method == 4:
        similarity = np.dot(A, A.T)
        # squared magnitude of each row = diagonal of the Gram matrix
        square_mag = np.diag(similarity)
        # inverse squared magnitude; all-zero rows give inf here...
        inv_square_mag = 1 / square_mag
        # ...which we zero out so empty rows get similarity 0, not inf
        inv_square_mag[np.isinf(inv_square_mag)] = 0
        # inverse of the magnitude
        inv_mag = np.sqrt(inv_square_mag)
        # scale columns, then rows, by the inverse magnitudes
        cosine = similarity * inv_mag
        return cosine.T * inv_mag
    if method == 5:
        # Sparse-input version of method 4.
        similarity = A * A.T
        # FIX: the original used A.sum(axis=1) for the squared magnitudes,
        # which is only correct for 0/1 matrices; A.multiply(A) handles
        # arbitrary values.
        square_mag = np.asarray(A.multiply(A).sum(axis=1)).ravel()
        inv_square_mag = 1 / square_mag
        inv_square_mag[np.isinf(inv_square_mag)] = 0
        inv_mag = np.sqrt(inv_square_mag)
        # FIX: densify explicitly -- np.array() on a sparse matrix wraps it
        # in a 0-d object array instead of converting it.
        cosine = similarity.toarray() * inv_mag
        return cosine.T * inv_mag
    if method == 6:
        return cosine_similarity(A)


# Assert that all results are consistent with the first method ("truth").
# FIX: the pasted version had lost its indentation and was not valid Python.
for m in range(1, 7):
    if m in [5]:  # method 5 takes the sparse matrix Asp as input
        np.testing.assert_allclose(calc_sim(A, method=1), calc_sim(Asp, method=m))
    else:
        np.testing.assert_allclose(calc_sim(A, method=1), calc_sim(A, method=m))


# Time them:
# NOTE(review): `%timeit` is an IPython magic and the `print` statements use
# Python 2 syntax -- this cell only runs inside a Python 2 IPython session.
print "Method 1"
%timeit calc_sim(A, method=1)
print "Method 2"
%timeit calc_sim(A, method=2)
print "Method 3"
%timeit calc_sim(A, method=3)
print "Method 4"
%timeit calc_sim(A, method=4)
print "Method 5"
%timeit calc_sim(Asp, method=5)
print "Method 6"
%timeit calc_sim(A, method=6)

结果:

Input data shape: (100, 10000)
Method 1
10 loops, best of 3: 71.3 ms per loop
Method 2
100 loops, best of 3: 8.2 ms per loop
Method 3
100 loops, best of 3: 8.6 ms per loop
Method 4
100 loops, best of 3: 2.54 ms per loop
Method 5
10 loops, best of 3: 73.7 ms per loop
Method 6
10 loops, best of 3: 77.3 ms per loop

我已经尝试了以上的一些方法。然而,@zbinsd 的实验有其局限性:实验中使用的矩阵稀疏度极低,而实际稀疏度通常在 90% 以上。在我的情况下,稀疏矩阵的形状是 (7000, 25000),稀疏度为 97%。方法 4 非常慢,我等不到结果;方法 6 在 10 秒内完成。令人惊讶的是,我尝试了下面的方法,只用了 0.247 秒。

import sklearn.preprocessing as pp


def cosine_similarities(mat):
    """Column-wise cosine similarity of a sparse matrix, kept sparse.

    Each column is L2-normalized (cheap in CSC layout); the product of the
    normalized matrix's transpose with itself is then exactly the pairwise
    cosine-similarity matrix of the columns.

    FIX(review): the pasted body had lost its indentation; reconstructed.
    """
    col_normed_mat = pp.normalize(mat.tocsc(), axis=0)
    return col_normed_mat.T * col_normed_mat

这种高效方法的出处见此处引用的链接。

以瓦利的解决方案为基础:

def sparse_cosine_similarity(sparse_matrix):
    """Row-wise cosine similarity of a sparse matrix, sparse throughout.

    The input is copied/converted to CSR, each stored value is divided by
    its row's L2 norm, and the normalized matrix is multiplied by its own
    transpose. All-zero rows contribute no stored entries, so no division
    by zero occurs for them.

    FIX(review): reconstructed the indentation lost in the paste, and use
    isinstance() instead of `type(...) is csr_matrix` -- with the old check
    a csr_matrix *subclass* fell through to tocsr(), which returns self,
    and the caller's matrix was then mutated in place.

    Returns a sparse (CSR-product) similarity matrix.
    """
    # Work on a copy so the caller's matrix is never mutated.
    out = (sparse_matrix.copy() if isinstance(sparse_matrix, csr_matrix)
           else sparse_matrix.tocsr())
    squared = out.multiply(out)
    sqrt_sum_squared_rows = np.array(np.sqrt(squared.sum(axis=1)))[:, 0]
    row_indices, col_indices = out.nonzero()
    out.data /= sqrt_sum_squared_rows[row_indices]
    return out.dot(out.T)

它接受一个稀疏矩阵(最好是 csr _ Matrix)并返回一个 csr _ Matrix。它应该使用稀疏计算完成更加密集的部分,并且内存开销非常小。不过我还没有进行过广泛的测试,所以买者请注意(更新: 我对这个解决方案有信心,因为我已经对它进行了测试和基准测试)

此外,这里是稀疏版本的 Waylon 的解决方案,以防它帮助任何人,不确定哪种解决方案实际上是更好的。

def sparse_cosine_similarity_b(sparse_matrix):
    """Row-wise cosine similarity via the sparse Gram matrix.

    Computes the Gram matrix A*A.T, reads the squared row norms off its
    diagonal, and rescales rows and columns by the inverse norms. All-zero
    rows end up with similarity 0 (their inf reciprocal is zeroed out).

    FIX(review): reconstructed the indentation lost in the paste; the
    divide-by-zero RuntimeWarning for empty rows is now silenced, since the
    inf it produces is deliberately replaced by 0 on the next line.

    Returns a sparse similarity matrix.
    """
    input_csr_matrix = sparse_matrix.tocsr()
    similarity = input_csr_matrix * input_csr_matrix.T
    square_mag = similarity.diagonal()
    with np.errstate(divide='ignore'):  # zero rows yield inf, zeroed below
        inv_square_mag = 1 / square_mag
    inv_square_mag[np.isinf(inv_square_mag)] = 0
    inv_mag = np.sqrt(inv_square_mag)
    return similarity.multiply(inv_mag).T.multiply(inv_mag)

这两种解决方案的结果似乎都与 sklearn.metrics.pairwise.cosine_similarity 一致。

干杯 :-D

更新:

现在,我已经针对我现有的 Cython 实现测试了这两种解决方案:https://github.com/davidmashburn/sparse_dot/blob/master/test/benchmarks_v3_output_table.txt 。看起来第一个算法在三个算法中表现最好。

from math import sqrt  # FIX(review): `sqrt` was used but never imported


def norm(vector):
    """Euclidean (L2) norm of an iterable of numbers."""
    return sqrt(sum(x * x for x in vector))


def cosine_similarity(vec_a, vec_b):
    """Cosine similarity between two equal-length dense vectors.

    Raises ZeroDivisionError if either vector has zero norm.
    (Indentation of the pasted version was reconstructed.)
    """
    norm_a = norm(vec_a)
    norm_b = norm(vec_b)
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    return dot / (norm_a * norm_b)

如果一次传入一对向量,这种方法似乎比使用 sklearn 的实现要快一些。

我建议分两步跑:

1)生成映射 A,映射 A: 列索引-> 非零对象

2)对于每个在列 {k1,...,kn} 上有非零值的对象 i(行),只对并集 A[k1] ∪ A[k2] ∪ ... ∪ A[kn] 中的元素计算余弦相似度

假设一个大的稀疏矩阵具有高稀疏性,这将获得一个显着的增强超过蛮力

@ Jeff 的解决方案改变了

从 scikit-learn 1.1.2 版本开始,您不需要在调用 cosine_similarity 之前使用 scipy 的 sparse 进行转换

你只需要 cosine_similarity

from typing import Tuple


import numpy as np
import perfplot
import scipy
from sklearn.metrics.pairwise import cosine_similarity as cosine_similarity_sklearn_internal
from scipy import spatial
from scipy import sparse
import sklearn.preprocessing as pp


target_dtype = "float16"




class prettyfloat(float):
    """A float whose repr is rounded to exactly two decimal places."""

    def __repr__(self):
        # f-string formatting; identical output to the old "%.2f" % self
        return f"{self:.2f}"




def cosine_similarity_sklearn(x):
    """Pairwise cosine similarity of the rows of x via sklearn (dense path).

    FIX(review): the pasted body had lost its indentation; reconstructed.
    """
    return cosine_similarity_sklearn_internal(x)




def cosine_similarity_sklearn_sparse(x):
    """Convert x to CSR sparse first, then delegate to sklearn.

    FIX(review): the pasted body had lost its indentation; reconstructed.
    """
    x_sparse = sparse.csr_matrix(x)
    return cosine_similarity_sklearn_internal(x_sparse)




def cosine_similarity_einsum(x, y=None):
    """Pairwise cosine similarity between the rows of x and of y.

    If y is None, x is compared against itself.

    FIX(review): the original used `if y`, which raises ValueError
    ("truth value of an array ... is ambiguous") for any ndarray with more
    than one element; test explicitly against None instead. Indentation of
    the pasted version was also reconstructed.
    """
    normed_x = x / np.linalg.norm(x, axis=1)[:, None]
    normed_y = y / np.linalg.norm(y, axis=1)[:, None] if y is not None else normed_x
    return np.einsum("ik,jk->ij", normed_x, normed_y)




def cosine_similarity_scipy(x, y=None):
    """Cosine similarity between two 1-D vectors via scipy.

    If y is None, x is compared with itself (1.0 for any nonzero x).
    Note scipy's `cosine` operates on single vectors, not matrices.

    FIX(review): the original computed spatial.distance.cosine(x, x),
    silently ignoring y. Indentation of the paste was also reconstructed.
    """
    if y is None:
        y = x
    return 1 - spatial.distance.cosine(x, y)




def setup_n(n, dtype=None) -> np.ndarray:
    """Generate a random benchmark matrix of shape (2**n, 512).

    Parameters
    ----------
    n : int
        Log2 of the number of rows.
    dtype : str or np.dtype, optional
        Element type; defaults to the module-level `target_dtype`
        (backward-compatible generalization of the original).

    Returns
    -------
    np.ndarray
        FIX(review): the original annotation Tuple[np.ndarray, np.ndarray]
        was wrong -- a single array is returned.
    """
    if dtype is None:
        dtype = target_dtype
    nd_arr = np.random.randn(int(2 ** n), 512).astype(dtype)
    return nd_arr




def equality_check(a, b, dtype=None):
    """Approximate equality of two matrices (dense ndarray or scipy.sparse).

    Sparse inputs are densified, both operands are cast to `dtype`
    (defaulting to the module-level `target_dtype`), and compared with an
    absolute tolerance of 1e-3.

    FIX(review): reconstructed lost indentation; replaced
    `type(a) != np.ndarray` with isinstance(), which correctly treats
    ndarray subclasses such as np.matrix (they have no `.todense()` need)
    as already dense.
    """
    if dtype is None:
        dtype = target_dtype
    if not isinstance(a, np.ndarray):
        a = a.todense()
    if not isinstance(b, np.ndarray):
        b = b.todense()
    return np.isclose(a.astype(dtype), b.astype(dtype), atol=1e-3).all()




# Benchmark the three kernels on inputs of size 2**1 .. 2**9 rows.
# NOTE(review): requires the third-party `perfplot` package.
fig = perfplot.show(
setup=setup_n,
n_range=[k for k in range(1, 10)],
kernels=[
cosine_similarity_sklearn,
cosine_similarity_sklearn_sparse,
cosine_similarity_einsum,
# cosine_similarity_scipy,
],
labels=["sk-def", "sk+sparse", "einsum"],
logx=False,
logy=False,
xlabel='2^n',
equality_check=equality_check,
)


基准测试结果显示:在上面比较的三种方法中,直接把密集数组交给 sklearn 的 cosine_similarity(即不先转换为稀疏矩阵)表现最好。

scikit-learn==1.1.2,1.1.3

Float64和 float16中的结果可能不同。

  • 对于 float64, enter image description here

  • 对于 float16, enter image description here