给定若干查询向量(query vectors),在向量库(key vectors)中为每个查询向量找到与其欧氏距离最近的 top-k 个向量.
1. W1 - 矩阵
import numpy as np
from scipy.spatial.distance import cdist
# Brute-force top-k lookup using a full pairwise distance matrix.
# The three inputs below are empty-string placeholders; replace them with real
# arrays before running (cdist cannot operate on the placeholders).
query_matrix = '' # query vectors; annotated "NxC" by the author (rows = vectors, C = feature dim — TODO confirm)
keys_matrix = '' # key (library) vectors; also annotated "NxC" — presumably its row count may differ from the queries, confirm
keys_labels = '' # labels of the key vectors; annotated "Nx1" — TODO confirm shape, it determines the shape of `nearest`
topk = 100 # number of closest keys kept for each query
# Pairwise Euclidean distances: one row per query vector, one column per key vector.
dist = cdist(query_matrix, keys_matrix,metric='euclidean')
# Sort every row by ascending distance, keep the first `topk` column indices,
# and use them to look up the corresponding key labels.
nearest = keys_labels[np.argsort(dist,axis=1)[:,:topk]]
2. W2 - 向量
from typing import Dict, List
import numpy as np
import scipy
def _chi_squared_distance(p: np.ndarray, q: np.ndarray) -> float:
    """Return the chi-squared distance 0.5 * sum((p - q)^2 / (p + q)).

    Entries where ``p + q == 0`` are skipped, so they contribute nothing
    instead of triggering a division by zero.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    denom = p + q
    mask = denom != 0
    return float(0.5 * np.sum((p[mask] - q[mask]) ** 2 / denom[mask]))


def vector_distance(
    vec1: np.ndarray,
    vec2: np.ndarray,
    method: str = "l2",
    l2_normalize: bool = True,
) -> float:
    """
    Computes the distance between 2 vectors

    Args:
        vec1: First vector between which the distance will be computed
        vec2: Second vector
        method: Type of distance to be computed (case-insensitive). One of
            "l1", "l2", "normalizedl2", "cosine", "correlation",
            "chisquared", "normalizedchisquared" or "hamming".
        l2_normalize: Flag indicating whether the vectors should be normalized
            to be of unit length before the distance between them is computed

    Returns: Distance between the 2 input vectors

    Raises:
        ValueError: If `method` is not one of the supported names.
    """
    # Imported locally: a bare `import scipy` does not guarantee that the
    # `scipy.spatial.distance` submodule has been loaded.
    from scipy.spatial import distance as sp_distance

    # Pre-processing
    if l2_normalize:
        vec1 = vec1 / np.linalg.norm(vec1, 2)
        vec2 = vec2 / np.linalg.norm(vec2, 2)

    # Distance computation
    method = method.lower()
    if method == "l1":
        dist = np.sum(np.abs(vec1 - vec2))
    elif method == "l2":
        dist = np.linalg.norm(vec1 - vec2, 2)
    elif method == "normalizedl2":
        # Always compares unit-length vectors, regardless of `l2_normalize`.
        a = vec1 / np.linalg.norm(vec1, 2)
        b = vec2 / np.linalg.norm(vec2, 2)
        dist = np.linalg.norm(a - b, 2)
    elif method == "cosine":
        dist = sp_distance.cosine(vec1, vec2)
    elif method == "correlation":
        dist = sp_distance.correlation(vec1, vec2)
    elif method == "chisquared":
        # NOTE(review): the previous code called the non-existent
        # `scipy.chiSquared`; the common symmetric histogram form is used
        # here — confirm this is the intended definition.
        dist = _chi_squared_distance(vec1, vec2)
    elif method == "normalizedchisquared":
        # Rescale both vectors to sum to 1 before comparing them.
        a = vec1 / np.sum(vec1)
        b = vec2 / np.sum(vec2)
        dist = _chi_squared_distance(a, b)
    elif method == "hamming":
        # Compares only which entries are strictly positive.
        dist = sp_distance.hamming(vec1 > 0, vec2 > 0)
    else:
        raise ValueError("Distance method unknown: " + method)
    return dist
def compute_distances(
    query_feature: np.array, feature_dict: dict, method: str = "l2"
) -> List:
    """
    Measure how far the query feature is from every entry of a feature
    dictionary.

    Each stored feature is compared against `query_feature` with
    `vector_distance`, using that function's default normalization setting.

    Args:
        query_feature: Feature vector of the query image
        feature_dict: Mapping from image path to that image's feature vector
        method: Name of the distance passed through to `vector_distance`

    Returns: One (image path, distance) tuple per dictionary entry, in the
        dictionary's iteration order.
    """
    return [
        (path, vector_distance(query_feature, candidate, method))
        for path, candidate in feature_dict.items()
    ]
3. 向量计算 vs 最近邻
import numpy as np
from sklearn.neighbors import NearestNeighbors
# Pick the query image and look up its pre-computed feature vector.
query_im_path = '/path/to/images'
query_feature = train_features[query_im_path]
assert len(query_feature) == 512  # expected feature dimensionality

# Placeholder: fill in the gallery features before running.
valid_features = dict()  # key: image_path, value: feature
# Derived from the dict so that position i here matches row i of the feature
# matrix built below (dict keys and values iterate in the same order).
valid_image_paths = list(valid_features)

# --- Build the nearest-neighbour index ---
# Normalize every feature to unit length.
valid_features_list = np.array(list(valid_features.values()))
valid_features_list /= np.linalg.norm(valid_features_list, axis=1)[:, None]
# Create and fit the NearestNeighbors object (at most 100 neighbours returned).
nn = NearestNeighbors(
    algorithm='auto',
    metric='euclidean',
    n_neighbors=min(100, len(valid_features_list)),
)
nn.fit(valid_features_list)

# --- Brute force: compare the query against every feature one by one ---
bf_distances_and_paths = compute_distances(query_feature, valid_features)
bf_distances = [d for (p, d) in bf_distances_and_paths]
bf_closest_match_path = bf_distances_and_paths[np.argmin(bf_distances)][0]

# --- Nearest-neighbour search for the same query ---
# Normalize into a new array: an in-place `/=` would also modify the vector
# stored in train_features and fails for integer dtypes.
query_feature = query_feature / np.linalg.norm(query_feature, 2)
# kneighbors expects a 2-D array with one row per query.
query_feature = np.reshape(query_feature, (1, -1))
approx_distances, approx_im_indices = nn.kneighbors(query_feature)
approx_im_paths = [valid_image_paths[i] for i in approx_im_indices[0]]

# --- Compare: the brute-force best match must appear exactly once in the NN results ---
rank = np.where(np.array(approx_im_paths) == bf_closest_match_path)[0]
assert len(rank) == 1
# Index the single element explicitly: int() on a 1-element array is
# deprecated since NumPy 1.25.
assert approx_im_paths[int(rank[0])] == bf_closest_match_path