LangChain的embeddings结合向量数据库Weaviate接入ChatGPT-4、智谱AI和通义千问
注意:
1. 如果要使用nltk(代码中会调用 nltk.download('punkt') 下载数据),需要翻墙;如果只是使用中文分词,则不需要翻墙。
2. 如果不知道怎么安装和部署向量数据库Weaviate,可以参考《安装和部署向量数据库Weaviate》。
3. 如果不知道怎么使用向量数据库Weaviate,可以参考《向量数据库weaviate,Python Client v4一些简单使用》。
4. 如果不知道怎么写中文分词器,可以参考《中文分词器jieba结合LangChain》。
import re
from typing import Sequence, List, Union

import chardet
import nltk
import weaviate
from langchain.docstore.document import Document
from langchain.text_splitter import NLTKTextSplitter
from langchain.vectorstores import VectorStore
from langchain_community.document_loaders import (PyPDFLoader, Docx2txtLoader, UnstructuredMarkdownLoader, UnstructuredExcelLoader
)
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import ZhipuAIEmbeddings
from langchain_community.embeddings import DashScopeEmbeddings
from langdetect import detect
from weaviate import WeaviateClient
from weaviate.auth import AuthApiKey
from weaviate.collections.classes.grpc import MetadataQueryfrom diaggpt.textsplitter.jieba_text_splitter import JiebaTextSplitternltk.download('punkt')_default_text_splitter = NLTKTextSplitter.from_tiktoken_encoder(separator="\n\n",chunk_size=500,chunk_overlap=100,
)def get_vectorstore(client: WeaviateClient, collection_name: str, makers: str):if not check_index_exists(client, collection_name):create_collection(client, collection_name)else:print(f"索引 '{collection_name}' 早已存在.")embeddings = get_embeddings(makers)return WeaviateVectorStore(client, embeddings, collection_name)def check_index_exists(client: WeaviateClient, collection_name: str) -> bool:"""检查索引是否存在:param client: 连接:param collection_name: 索引名:return: True或者False"""try:collections = client.collections.list_all()# 检查 collection_name 是否存在于集合列表中collection_names = [c for c in collections]return collection_name in collection_namesexcept Exception as e:print(f"检查索引异常: {e}")return Falsedef get_embeddings(makers: str):if makers == "OpenAI":return OpenAIEmbeddings()elif makers == "ZhipuAI":return ZhipuAIEmbeddings()elif makers == "QianWen":return DashScopeEmbeddings()else:raise ValueError(f"不知道的厂商类型: {makers}")def create_collection(client: WeaviateClient, collection_name: str):collection_obj = {"class": collection_name,"description": "A collection for product information","invertedIndexConfig": {"bm25": {"b": 0.75,"k1": 1.2<