# Vector Search
# https://medium.com/@vidiptvashist/building-a-vector-database-from-scratch-in-python-6bd683ba5171
# Key questions: how should embeddings be produced, and how should similarity be computed?
# TODO: survey commonly used embedding models and algorithms.
# TODO: this demo only embeds short strings; for long documents, how should the text be chunked,
#       and how important is the chunk-size parameter?
from typing import Any
import numpy as np
class VectorStore:
    """A minimal in-memory vector store with brute-force cosine-similarity search.

    Vectors are kept in a plain dict keyed by ``vector_id``. On every insert,
    a pairwise cosine-similarity index is updated (O(n) per insert); lookups in
    ``find_similar_vectors`` still scan all stored vectors (brute force).
    """

    def __init__(self):
        # Maps vector_id -> the stored vector.
        self.vector_data: dict[str, np.ndarray] = {}
        # Maps vector_id -> {other_id: cosine similarity}; refreshed on insert.
        self.vector_index: dict[str, dict] = {}

    @staticmethod
    def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine similarity of ``a`` and ``b``.

        Returns 0.0 when either vector has zero norm — this guards against the
        NaN (and NumPy RuntimeWarning) the unchecked division would produce for
        an all-zero stored or query vector.
        """
        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom == 0.0:
            return 0.0
        return float(np.dot(a, b) / denom)

    def add_vector(self, vector_id: str, vector: np.ndarray) -> None:
        """
        Add a vector to the store and update the similarity index.

        Args:
            vector_id (str): A unique identifier for the vector (an existing
                id is silently overwritten).
            vector (numpy.ndarray): The vector data to be stored.
        """
        self.vector_data[vector_id] = vector
        self._update_index(vector_id, vector)

    def get_vector(self, vector_id: str):
        """
        Retrieve a vector from the store.

        Args:
            vector_id (str): The identifier of the vector to retrieve.

        Returns:
            numpy.ndarray: The vector data if found, or None if not found.
        """
        return self.vector_data.get(vector_id)

    def _update_index(self, vector_id: str, vector: np.ndarray) -> None:
        """
        Record the cosine similarity between the new vector and every stored
        vector (including itself, since it was inserted before this call).

        Args:
            vector_id (str): The identifier of the new vector.
            vector (numpy.ndarray): The vector data.
        """
        for existing_id, existing_vector in self.vector_data.items():
            similarity = self._cosine_similarity(vector, existing_vector)
            # setdefault creates the inner dict on first sight of existing_id.
            self.vector_index.setdefault(existing_id, {})[vector_id] = similarity

    def find_similar_vectors(self, query_vector: np.ndarray, num_results: int = 5):
        """
        Find the stored vectors most similar to the query via brute-force scan.

        Args:
            query_vector (numpy.ndarray): The query vector for similarity search.
            num_results (int): The maximum number of similar vectors to return.

        Returns:
            list: (vector_id, similarity_score) tuples, best match first.
        """
        results: list[tuple[str, float]] = [
            (vector_id, self._cosine_similarity(query_vector, vector))
            for vector_id, vector in self.vector_data.items()
        ]
        # Highest similarity first; truncate to the requested count.
        results.sort(key=lambda item: item[1], reverse=True)
        return results[:num_results]
# --- Demo: bag-of-words vectors + cosine-similarity search -------------------

# The store that will hold one count vector per sentence.
vector_store = VectorStore()

# Tiny example corpus.
sentences = [
    "I eat mango",
    "mango is my favorite fruit",
    "mango, apple, oranges are fruits",
    "fruits are good for health",
]

# Vocabulary: every distinct lowercase whitespace-delimited token in the corpus.
# (Punctuation is NOT stripped, so "mango," and "mango" are different tokens.)
vocabulary: set[str] = set()
for sentence in sentences:
    vocabulary.update(sentence.lower().split())

# Assign each vocabulary word a fixed slot in the vectors.
word_to_index = {word: i for i, word in enumerate(vocabulary)}

# Encode each sentence as a vector of per-token counts.
sentence_vectors: dict[str, np.ndarray] = {}
for sentence in sentences:
    counts = np.zeros(len(vocabulary))
    for token in sentence.lower().split():
        counts[word_to_index[token]] += 1
    sentence_vectors[sentence] = counts

# Register every sentence vector, keyed by the original (unlowercased) sentence.
for sentence, counts in sentence_vectors.items():
    vector_store.add_vector(sentence, counts)

# Encode the query the same way; tokens outside the vocabulary are skipped.
query_sentence = "Mango is the best fruit"
query_vector = np.zeros(len(vocabulary))
query_tokens = query_sentence.lower().split()
for token in query_tokens:
    if token in word_to_index:
        query_vector[word_to_index[token]] += 1

# Fetch the two closest sentences and report them.
similar_sentences = vector_store.find_similar_vectors(query_vector, num_results=2)

print("Query Sentence:", query_sentence)
print("Similar Sentences:")
for sentence, similarity in similar_sentences:
    print(f"{sentence}: Similarity = {similarity:.4f}")