Goal

Understand the use cases of the MultiVectorRetriever.


It is often useful to store multiple vectors per document.

LangChain provides a base MultiVectorRetriever that makes querying this kind of setup straightforward. Most of the complexity lies in how you create the multiple vectors per document.

Ways to create multiple vectors per document include:

  1. Smaller chunks: split each document into smaller chunks and embed those chunks (this is what the ParentDocumentRetriever does).
  2. Summaries: create a summary for each document and embed it along with (or instead of) the document.
  3. Hypothetical questions: create hypothetical questions that each document would be well suited to answer, and embed them along with (or instead of) the document.
  4. Manual addition.

Splitting into smaller chunks (demonstrated in the code sections below)


Generating summaries

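The summary variant is not covered by the code later in this page, so here is a minimal sketch. It assumes `docs` and `vectorstore` are built exactly as in the sections below; the chat model and prompt wording are placeholders. The mechanics mirror the small-chunk flow: the summaries carry the parent doc_id in their metadata, and the full documents go into the docstore.

import uuid

from langchain.retrievers import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI  # placeholder model; any chat model works

# Map each document to a short summary
summary_chain = (
    {"doc": lambda d: d.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
)

# `docs` and `vectorstore` are assumed to exist as in the code sections below
summaries = summary_chain.batch(docs, {"max_concurrency": 5})

store = InMemoryByteStore()
id_key = "doc_id"
retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)

doc_ids = [str(uuid.uuid4()) for _ in docs]
summary_docs = [
    Document(page_content=s, metadata={id_key: doc_ids[i]}) for i, s in enumerate(summaries)
]

# Embed only the summaries; the full documents stay in the docstore
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))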

Hypothetical questions

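Likewise not covered below; a rough sketch of the hypothetical-question variant, reusing the retriever, docs, doc_ids, id_key, and imports from the summary sketch above. The prompt, model, and line-splitting are illustrative assumptions.

# Ask the model for 3 hypothetical questions each document could answer
question_chain = (
    {"doc": lambda d: d.page_content}
    | ChatPromptTemplate.from_template(
        "Generate exactly 3 hypothetical questions, one per line, "
        "that the following document could be used to answer:\n\n{doc}"
    )
    | ChatOpenAI(temperature=0)  # placeholder model
    | StrOutputParser()
)

question_docs = []
for i, doc in enumerate(docs):  # docs, doc_ids, id_key, retriever as in the summary sketch
    questions = [q.strip() for q in question_chain.invoke(doc).split("\n") if q.strip()]
    question_docs.extend(
        Document(page_content=q, metadata={id_key: doc_ids[i]}) for q in questions
    )

# Embed only the questions; retrieval still resolves back to the full parent documents
retriever.vectorstore.add_documents(question_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))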


InMemoryByteStore: keeps the document-ID mapping in memory; once a small chunk is retrieved, its doc_id is used to look up the original document.

Storing the document text in memory: InMemoryByteStore

# Imports needed for this snippet
import uuid

from langchain.retrievers import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain_community.document_loaders.web_base import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Local BGE model
bge_en_v1p5_model_path = "D:\\LLM\\Bge_models\\bge-base-en-v1.5"

# Use the GPU
embeddings_model = HuggingFaceEmbeddings(
    model_name=bge_en_v1p5_model_path,
    model_kwargs={'device': 'cuda:0'},
    encode_kwargs={'batch_size': 32, 'normalize_embeddings': True},
)

vectorstore = Chroma(persist_directory="D:\\LLM\\my_projects\\chroma_db", embedding_function=embeddings_model)

# Parse and load the URL
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)

# The storage layer for the parent documents
store = InMemoryByteStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)

# One ID per parent document
doc_ids = [str(uuid.uuid4()) for _ in docs]

# The splitter used to create the smaller chunks
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

sub_docs = []
for i, doc in enumerate(docs):
    _id = doc_ids[i]
    _sub_docs = child_text_splitter.split_documents([doc])
    # Tag every child chunk with its parent document ID
    for _doc in _sub_docs:
        _doc.metadata[id_key] = _id
    sub_docs.extend(_sub_docs)

question = "What are the approaches to Task Decomposition?"

# Index the small chunks and map the parent documents by ID
retriever.vectorstore.add_documents(sub_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

# Get a small chunk directly from the vector store
doc = retriever.vectorstore.similarity_search("justice breyer")[0]
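The vector store search above returns a small chunk. Querying the retriever itself searches the child chunks and then returns the full parent documents resolved through doc_id; a brief sketch using the standard invoke interface of LangChain retrievers:

# The retriever resolves matching chunks back to their parent documents
retrieved_docs = retriever.invoke(question)
print(len(retrieved_docs[0].page_content))  # much larger than a 400-character chunk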

Storing the document text in Elasticsearch

small chunk ← doc_id → large document

Approach:

Indexing: first index the small text chunks (with embeddings), then push the full documents (without embeddings).

Small-chunk document structure

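The original screenshot is not reproduced here; roughly, a small-chunk document in the index looks like the following (field names taken from the indexing code below, with the embedding stored in the question_vectors field configured on the store; the exact mapping may differ):

small_chunk_doc = {
    "text": "a ~400-character child chunk of the page",
    "metadata": {"doc_id": "uuid-of-the-parent-document", "data_type": "small_chunk"},
    "question_vectors": [0.013, -0.028, 0.041],  # truncated; dense vector written by ElasticsearchStore
}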

Large-document structure

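The large (parent) document is pushed via client.index without an embedding, keyed by the same doc_id (again a sketch based on the code below):

big_chunk_doc = {
    "text": "the full ~10,000-character parent document",
    "metadata": {"doc_id": "uuid-of-the-parent-document", "data_type": "big_chunk"},
}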

# (imports: see the complete code at the end of this section)

# Local BGE model
bge_en_v1p5_model_path = "D:\\LLM\\Bge_models\\bge-base-en-v1.5"

# Use the GPU
embeddings_model = HuggingFaceEmbeddings(
    model_name=bge_en_v1p5_model_path,
    model_kwargs={'device': 'cuda:0'},
    encode_kwargs={'batch_size': 32, 'normalize_embeddings': True},
)

# Elasticsearch as the vector store
vectorstore = ElasticsearchStore(
    es_url=os.environ['ELASTIC_HOST_HTTP'],
    index_name="index_sd_1024_vectors",
    embedding=embeddings_model,
    es_user="elastic",
    vector_query_field='question_vectors',
    es_password=os.environ['ELASTIC_ACCESS_PASSWORD']
)

# Parse and load the URL
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
docs = text_splitter.split_documents(docs)

# Generate one ID per large (parent) document
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Split into small chunks
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
id_key = 'doc_id'

for i, doc in enumerate(docs):
    _id = doc_ids[i]
    _sub_docs = child_text_splitter.split_documents([doc])
    for _doc in _sub_docs:
        _doc.metadata[id_key] = _id
        _doc.metadata['data_type'] = 'small_chunk'

    # Embed the small chunks and push them to ES
    vectorstore.add_documents(_sub_docs)

    # Push the whole parent document to ES (no embedding)
    tmp_doc = {
        'text': doc.page_content,
        'metadata': {
            'data_type': 'big_chunk',
            id_key: _id,
        }
    }
    vectorstore.client.index(index='index_sd_1024_vectors', id=_id, document=tmp_doc)

Query: first use semantic search to find the small chunks, then use their doc_id to fetch the large documents.

# Get the small chunks
small_chunk_docs = vectorstore.similarity_search('justice breyer')

# Get the large documents:
# collect the distinct parent-document IDs first
id_key = 'doc_id'
doc_uuid_list = []
for doc in small_chunk_docs:
    if doc.metadata[id_key] not in doc_uuid_list:
        doc_uuid_list.append(doc.metadata[id_key])
        print('Found small chunk [{}]'.format(doc.metadata[id_key]))

# then fetch each parent document from ES by its ID
big_docs = []
for doc_uuid in doc_uuid_list:
    res = vectorstore.client.get(index='index_sd_1024_vectors', id=doc_uuid)
    big_docs.append(Document(page_content=res.body['_source']['text']))

print('Retrieved [{}] large documents'.format(len(big_docs)))
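Because the large documents are indexed without the question_vectors field, the kNN search above normally only matches small chunks. If you want to make that explicit, ElasticsearchStore.similarity_search accepts a filter of Elasticsearch query clauses; a hedged sketch (the exact field path, e.g. a .keyword suffix, depends on your index mapping):

# Restrict the search to small chunks; the field path is an assumption, adjust to your mapping
small_chunk_docs = vectorstore.similarity_search(
    'justice breyer',
    filter=[{"term": {"metadata.data_type.keyword": "small_chunk"}}],
)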

Complete code

import os
from uuid import uuid4

from langchain import hub
from langchain.agents import create_openai_tools_agent, AgentExecutor
from langchain.retrievers import EnsembleRetriever, MultiQueryRetriever, MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain.tools.retriever import create_retriever_tool
from langchain_community.chat_models.azure_openai import AzureChatOpenAI
from langchain_community.document_loaders.web_base import WebBaseLoader

from langchain_community.embeddings import HuggingFaceEmbeddings, QianfanEmbeddingsEndpoint
from langchain_community.retrievers import ElasticSearchBM25Retriever
from langchain_community.vectorstores.chroma import Chroma
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
import uuid

# Set a random run ID
unique_id = uuid4().hex[0:8]

# Set a log name
os.environ["LANGCHAIN_PROJECT"] = f" [MultiVectorRetriever] small chunks - {unique_id}"

# Enable LangSmith tracing
# os.environ["LANGCHAIN_TRACING_V2"] = 'true'

# Fill in your API key
os.environ["LANGCHAIN_API_KEY"] = os.getenv('MY_LANGCHAIN_API_KEY')

if __name__ == '__main__':

    # Local BGE model
    bge_en_v1p5_model_path = "D:\\LLM\\Bge_models\\bge-base-en-v1.5"

    # Use the GPU
    embeddings_model = HuggingFaceEmbeddings(
        model_name=bge_en_v1p5_model_path,
        model_kwargs={'device': 'cuda:0'},
        encode_kwargs={'batch_size': 32, 'normalize_embeddings': True},
    )

    # Elasticsearch as the vector store
    vectorstore = ElasticsearchStore(
        es_url=os.environ['ELASTIC_HOST_HTTP'],
        index_name="index_sd_1024_vectors",
        embedding=embeddings_model,
        es_user="elastic",
        vector_query_field='question_vectors',
        es_password=os.environ['ELASTIC_ACCESS_PASSWORD']
    )

    # Parse and load the URL
    loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
    docs = text_splitter.split_documents(docs)

    # Generate one ID per large (parent) document
    doc_ids = [str(uuid.uuid4()) for _ in docs]

    # Split into small chunks
    child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
    id_key = 'doc_id'

    for i, doc in enumerate(docs):
        _id = doc_ids[i]
        _sub_docs = child_text_splitter.split_documents([doc])
        for _doc in _sub_docs:
            _doc.metadata[id_key] = _id
            _doc.metadata['data_type'] = 'small_chunk'

        # Embed the small chunks and push them to ES
        vectorstore.add_documents(_sub_docs)

        # Push the whole parent document to ES (no embedding)
        tmp_doc = {
            'text': doc.page_content,
            'metadata': {
                'data_type': 'big_chunk',
                id_key: _id,
            }
        }
        vectorstore.client.index(index='index_sd_1024_vectors', id=_id, document=tmp_doc)

    # Get the small chunks
    small_chunk_docs = vectorstore.similarity_search('justice breyer')

    # Get the large documents:
    # collect the distinct parent-document IDs first
    id_key = 'doc_id'
    doc_uuid_list = []
    for doc in small_chunk_docs:
        if doc.metadata[id_key] not in doc_uuid_list:
            doc_uuid_list.append(doc.metadata[id_key])
            print('Found small chunk [{}]'.format(doc.metadata[id_key]))

    # then fetch each parent document from ES by its ID
    big_docs = []
    for doc_uuid in doc_uuid_list:
        res = vectorstore.client.get(index='index_sd_1024_vectors', id=doc_uuid)
        big_docs.append(Document(page_content=res.body['_source']['text']))

    print('Retrieved [{}] large documents'.format(len(big_docs)))