from __future__ import print_function
from __future__ import division
import numpy as np
import faiss
import sys
import time
import warnings
# Suppress some noisy warnings (PIL EXIF / numpy binary-compat messages)
if not sys.warnoptions:
# suppress pesky PIL EXIF warnings
warnings.simplefilter("once")
warnings.filterwarnings("ignore", message="(Possibly )?corrupt EXIF data.*")
warnings.filterwarnings("ignore", message="numpy.dtype size changed.*")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed.*")
def reserve_faiss_gpu_memory(gpu_id=0):
"""
    Pin the GPU used by Faiss and pre-allocate GPU memory on it.
    Reserves around 2.4 GB of memory on a Titan Xp.
    `r = reserve_faiss_gpu_memory()`
    To release the memory run `del r`.
    Something like 200 MB will still be held afterwards.
"""
res = faiss.StandardGpuResources()
cfg = faiss.GpuIndexFlatConfig()
cfg.useFloat16 = False
cfg.device = gpu_id
index = faiss.GpuIndexFlatL2(res, 2048, cfg)
return index, res
class MemoryReserver():
"""
    Faiss GPU memory manager.
    Without it, another process may claim the memory of the GPU that is
    currently in use and the program will crash.
"""
def __init__(self):
self.memory_holder = None
def lock(self, backend):
# reserve memory for faiss if backend is faiss-gpu
if backend == 'faiss-gpu':
print('[INFO]Reserve some memory for FAISS')
self.memory_holder = reserve_faiss_gpu_memory(gpu_id=0)
else:
self.memory_holder = None
def release(self):
if self.memory_holder is not None:
print('[INFO]Release memory for FAISS')
self.memory_holder = None
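# Minimal usage sketch (illustrative, not from the original file): wrap a
# faiss-gpu workload with MemoryReserver so the GPU memory is grabbed up front
# and released afterwards; 'faiss-gpu' is the backend string checked in lock().
def _demo_memory_reserver():
    reserver = MemoryReserver()
    reserver.lock(backend='faiss-gpu')  # requires a GPU build of faiss
    # ... run clustering / knn search on the GPU here ...
    reserver.release()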
# Feature preprocessing:
# PCA + whitening + L2 normalization
def preprocess_features(x, x2=None, d=256):
"""
    Calculate PCA + whitening + L2 normalization for each vector.
    Args:
        x (ndarray): N x D, where N is the number of vectors and D the dimensionality.
        x2 (ndarray): optional; if not None, apply the PCA + whitening learned on x to x2.
        d (int): number of output dimensions (how many principal components to keep).
    Returns:
        the transformed [N x d] matrix xt (and the transformed x2 if given).
"""
n, orig_d = x.shape
    # PCA dimensionality reduction + whitening
pcaw = faiss.PCAMatrix(d_in=orig_d, d_out=d, eigen_power=-0.5, random_rotation=False)
pcaw.train(x)
assert pcaw.is_trained
print('Performing PCA + whitening')
x = pcaw.apply_py(x)
print('x.shape after PCA + whitening:', x.shape)
l2normalization = faiss.NormalizationTransform(d, 2.0)
print('Performing L2 normalization')
x = l2normalization.apply_py(x)
if x2 is not None:
print('Perform PCA + whitening for x2')
x2 = pcaw.apply_py(x2)
x2 = l2normalization.apply_py(x2)
return x, x2
else:
return x
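# Minimal usage sketch (illustrative, hypothetical data): learn PCA + whitening
# on one set of vectors and apply the same transform to a held-out set.
def _demo_preprocess_features():
    train = np.random.rand(1000, 512).astype('float32')
    held_out = np.random.rand(100, 512).astype('float32')
    train_t, held_out_t = preprocess_features(train, x2=held_out, d=64)
    print(train_t.shape, held_out_t.shape)  # (1000, 64) (100, 64)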
#Kmeans
def train_kmeans(x, num_clusters=1000, gpu_ids=None, niter=100, nredo=1, verbose=0):
"""
    Runs k-means clustering on one or several GPUs (or on the CPU if gpu_ids is None).
"""
assert np.all(~np.isnan(x)), 'x contains NaN'
assert np.all(np.isfinite(x)), 'x contains Inf'
if isinstance(gpu_ids, int):
gpu_ids = [gpu_ids]
assert gpu_ids is None or len(gpu_ids)
d = x.shape[1]
kmeans = faiss.Clustering(d, num_clusters)
kmeans.verbose = bool(verbose)
kmeans.niter = niter
kmeans.nredo = nredo
# otherwise the kmeans implementation sub-samples the training set
kmeans.max_points_per_centroid = 10000000
if gpu_ids is not None:
res = [faiss.StandardGpuResources() for i in gpu_ids]
flat_config = []
for i in gpu_ids:
cfg = faiss.GpuIndexFlatConfig()
cfg.useFloat16 = False
cfg.device = i
flat_config.append(cfg)
if len(gpu_ids) == 1:
index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
else:
indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
for i in range(len(gpu_ids))]
            # replicate the index on each GPU and split queries between them
            # (IndexProxy was renamed IndexReplicas in newer Faiss releases)
            index = faiss.IndexProxy()
            for sub_index in indexes:
                index.addIndex(sub_index)
else:
index = faiss.IndexFlatL2(d)
# perform the training
kmeans.train(x, index)
centroids = faiss.vector_float_to_array(kmeans.centroids)
    objective = faiss.vector_float_to_array(kmeans.obj)
    if verbose:
        print("[INFO]Final objective: %.4g" % objective[-1])
return centroids.reshape(num_clusters, d)
# Compute cluster assignments (nearest-centroid labels)
def compute_cluster_assignment(centroids, x):
assert centroids is not None, "should train before assigning"
d = centroids.shape[1]
index = faiss.IndexFlatL2(d)
index.add(centroids)
distances, labels = index.search(x, 1)
return labels.ravel()
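# Minimal usage sketch (illustrative, hypothetical data): train k-means on the
# CPU (gpu_ids=None) and assign every vector to its nearest centroid.
def _demo_kmeans():
    feats = np.random.rand(5000, 64).astype('float32')
    centroids = train_kmeans(feats, num_clusters=10, gpu_ids=None, niter=20)
    labels = compute_cluster_assignment(centroids, feats)
    print(centroids.shape, labels.shape)  # (10, 64) (5000,)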
def do_clustering(features, num_clusters, gpu_ids=None,
num_pca_components=None, niter=100, nredo=1, verbose=0):
print('[INFO]FAISS: using GPUs {}'.format(gpu_ids))
features = np.asarray(features.reshape(features.shape[0], -1), dtype=np.float32)
    if num_pca_components is not None:
        features = preprocess_features(features, d=num_pca_components)
print('[INFO]FAISS: clustering...')
t0 = time.time()
    centroids = train_kmeans(features, num_clusters, gpu_ids=gpu_ids,
                             niter=niter, nredo=nredo, verbose=verbose)
labels = compute_cluster_assignment(centroids, features)
t1 = time.time()
print('[INFO]FAISS: Clustering total elapsed time: %.3f m' % ((t1 - t0) / 60.0))
return labels
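# Minimal usage sketch (illustrative, hypothetical data): do_clustering bundles
# the optional PCA preprocessing, k-means training and label assignment.
def _demo_do_clustering():
    feats = np.random.rand(5000, 512).astype('float32')
    labels = do_clustering(feats, num_clusters=10, gpu_ids=None,
                           num_pca_components=64, niter=20)
    print(labels.shape)  # (5000,)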
def find_nearest_neighbors(x, queries=None, k=5, gpu_id=None):
"""
    Find k nearest neighbors for each of the n examples.
    Distances are computed using the squared Euclidean distance metric.
    Arguments:
    ----------
    x (ndarray): N examples to search within. [N x d].
    gpu_id (int): use CPU if None, else use the GPU with the specified id.
    queries (ndarray): find nearest neighbors for each query example. [M x d] matrix.
        If None, then find k nearest neighbors for each row of x
        (excluding self examples).
    k (int): number of nearest neighbors to find.
    Returns:
    I (ndarray): Indices of the nearest neighbors. [M x k]
    distances (ndarray): Distances to the nearest neighbors. [M x k]
"""
if gpu_id is not None and not isinstance(gpu_id, int):
raise ValueError('gpu_id must be None or int')
x = np.asarray(x.reshape(x.shape[0], -1), dtype=np.float32)
    remove_self = False  # will each query itself appear in its own search results?
if queries is None:
remove_self = True
queries = x
k += 1
d = x.shape[1]
tic = time.time()
if gpu_id is None:
print('[INFO]FAISS: cpu::find {} nearest neighbors'\
.format(k - int(remove_self)))
index = faiss.IndexFlatL2(d)
else:
print('[INFO]FAISS: gpu[{}]::find {} nearest neighbors'\
.format(gpu_id, k - int(remove_self)))
cfg = faiss.GpuIndexFlatConfig()
cfg.useFloat16 = False
cfg.device = gpu_id
flat_config = [cfg]
resources = [faiss.StandardGpuResources()]
index = faiss.GpuIndexFlatL2(resources[0], d, flat_config[0])
index.add(x)
    # search for the k nearest neighbors of each query
distances, nns = index.search(queries, k)
    if remove_self:
        # drop each query from its own neighbor list, then trim back to k-1 columns
        for i in range(len(nns)):
indices = np.nonzero(nns[i, :] != i)[0]
indices.sort()
if len(indices) > k - 1:
indices = indices[:-1]
nns[i, :-1] = nns[i, indices]
distances[i, :-1] = distances[i, indices]
nns = nns[:, :-1]
distances = distances[:, :-1]
print('[INFO]FAISS: Neighbors search total elapsed time: {:.2f} sec'.format(time.time() - tic))
return nns, distances
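# Minimal usage sketch (illustrative, hypothetical data): with queries=None the
# function searches x against itself and excludes each point from its own list.
def _demo_find_nearest_neighbors():
    db = np.random.rand(2000, 128).astype('float32')
    nns, dists = find_nearest_neighbors(db, queries=None, k=5, gpu_id=None)
    print(nns.shape, dists.shape)  # (2000, 5) (2000, 5)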
def example(size=30000, k=10, num_pca_components=256):
gpu_ids = [0]
x = np.random.rand(size, 512)
print("[INFO]Reshape")
x = x.reshape(x.shape[0], -1).astype('float32')
x, _ = preprocess_features(x, x, d=num_pca_components)
print("[INFO]Run")
t0 = time.time()
centroids = train_kmeans(x, k, gpu_ids=gpu_ids)
print('[INFO]Compute_cluster_assignment')
labels = compute_cluster_assignment(centroids, x)
print('[INFO]centroids.shape:', centroids.shape)
print('[INFO]labels.type:', labels.__class__, labels.dtype)
print('[INFO]labels.shape:', labels.shape)
t1 = time.time()
print("[INFO]total runtime: %.2f s" % (t1 - t0))
def test_knn_search(size=10000, gpu_id=None):
x = np.random.rand(size, 512)
x = x.reshape(x.shape[0], -1).astype('float32')
d = x.shape[1]
tic = time.time()
if gpu_id is None:
index = faiss.IndexFlatL2(d)
else:
cfg = faiss.GpuIndexFlatConfig()
cfg.useFloat16 = False
cfg.device = gpu_id
flat_config = [cfg]
resources = [faiss.StandardGpuResources()]
index = faiss.GpuIndexFlatL2(resources[0], d, flat_config[0])
index.add(x)
print('[INFO]Index built in {} sec'.format(time.time() - tic))
distances, I = index.search(x, 21)
print('[INFO]Searched in {} sec'.format(time.time() - tic))
print('[INFO]distance shape', distances.shape)
print('[INFO]index shape', I.shape)
print('[INFO]top5 distance', distances[:5])
print('[INFO]top5 index', I[:5])
if __name__ == '__main__':
    # PCA + clustering example
example(size=100000, k=3, num_pca_components=32)
    # KNN search example
test_knn_search(size=100000, gpu_id=5)
# From: faissext.py