Building a Private Knowledge-Base Chatbot with GPT (Part 3): Vector Data Training
Source: cnblogs · Author: 伊力编程 · Date: 2023/7/14

The previous articles covered the overall approach and the environment setup. This article focuses on the data training workflow: how to load, split, and train the data, and how to store it in the Milvus vector database.

1. Data training depends on a vector database

This series uses Milvus as the vector database. For the rest of the base environment, see the previous article, "Building a Private Knowledge-Base Chatbot with GPT (Part 2): Environment Setup".
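
Before training, it is worth confirming that Milvus is actually reachable. Below is a minimal connectivity check with the pymilvus client; the host and port are assumptions based on the default standalone install from the environment-setup article, so adjust them to your deployment:

    from pymilvus import connections, utility

    # Connect to the Milvus instance prepared in Part 2 (19530 is Milvus's default port)
    connections.connect(alias="default", host="127.0.0.1", port="19530")
    print(utility.list_collections())  # an empty list on a fresh install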

2. Data training workflow

The data training workflow consists of preparing the PDF documents, uploading them to the system file directory, starting training, loading the file content, splitting the content, and storing it in the vector database. The whole flow, in order:

    prepare PDFs → upload to file directory → start training → load content → split content → store in Milvus

3. Code walkthrough

3.1 Uploading files to the system file directory

    from flask import Flask, request, jsonify, render_template
    import os

    app = Flask(__name__)
    # Root directory for uploaded knowledge files
    KNOWLEDGE_FOLDER = os.environ.get('KNOWLEDGE_FOLDER')

    @app.route('/upload', methods=['GET', 'POST'])
    def index():
        if request.method == 'POST':
            # Knowledge-base name, used as a sub-directory under KNOWLEDGE_FOLDER
            text = request.form.get('name')
            # The uploaded file
            file = request.files.get('file')
            if file:
                # Save the file to the server, creating the target directory if needed
                filename = file.filename
                os.makedirs(os.path.join(KNOWLEDGE_FOLDER, text), exist_ok=True)
                file_path = os.path.join(KNOWLEDGE_FOLDER, text, filename)
                file.save(file_path)
            else:
                file_path = None
            return jsonify({'message': 'Upload succeeded', 'fileServicePath': file_path})
        return render_template('index.html')
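
For reference, here is a minimal client-side test of the /upload endpoint above, a sketch assuming the Flask app runs locally on port 5000; the knowledge-base name and file path are made up:

    import requests

    resp = requests.post(
        "http://127.0.0.1:5000/upload",
        data={"name": "my_kb"},                         # knowledge-base name (sub-directory)
        files={"file": open("docs/sample.pdf", "rb")},  # the document to ingest
    )
    print(resp.json())  # {'message': 'Upload succeeded', 'fileServicePath': '...'}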

3.2 Loading file content

    # Map file extensions to document loaders
    LOADER_MAPPING = {
        ".csv": (CSVLoader, {}),
        # ".docx": (Docx2txtLoader, {}),  # superseded by UnstructuredWordDocumentLoader below
        ".doc": (UnstructuredWordDocumentLoader, {}),
        ".docx": (UnstructuredWordDocumentLoader, {}),
        ".enex": (EverNoteLoader, {}),
        ".eml": (MyElmLoader, {}),
        ".epub": (UnstructuredEPubLoader, {}),
        ".html": (UnstructuredHTMLLoader, {}),
        ".md": (UnstructuredMarkdownLoader, {}),
        ".odt": (UnstructuredODTLoader, {}),
        ".pdf": (PDFMinerLoader, {}),
        ".ppt": (UnstructuredPowerPointLoader, {}),
        ".pptx": (UnstructuredPowerPointLoader, {}),
        ".txt": (TextLoader, {"encoding": "utf8"}),
    }

    def load_single_document(file_path: str) -> List[Document]:
        ext = "." + file_path.rsplit(".", 1)[-1]
        if ext in LOADER_MAPPING:
            loader_class, loader_args = LOADER_MAPPING[ext]
            loader = loader_class(file_path, **loader_args)
            return loader.load()
        raise ValueError(f"Unsupported file extension '{ext}'")

    # Load all files under a knowledge sub-directory
    def load_documents_knowledge(source_dir: str, secondary_directories: str) -> List[Document]:
        """
        Load all documents from the source directory recursively,
        parsing files in parallel across a process pool.
        """
        all_files = []
        for ext in LOADER_MAPPING:
            all_files.extend(
                glob.glob(os.path.join(source_dir, secondary_directories, f"**/*{ext}"), recursive=True)
            )
        filtered_files = [file_path for file_path in all_files if file_path]
        with Pool(processes=os.cpu_count()) as pool:
            results = []
            with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
                for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
                    results.extend(docs)
                    pbar.update()
        return results
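
As a quick sanity check, load_single_document can also be called directly on one file; the path here is hypothetical:

    docs = load_single_document("source_documents/manual.pdf")
    # PDFMinerLoader returns the extracted text, with the source path in metadata
    print(len(docs), docs[0].metadata)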

3.3 Splitting the content

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
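
To see what the two parameters do, here is a small sketch with made-up input, using chunk_size=500 and chunk_overlap=50 (the values set in the full code below): each chunk holds at most 500 characters, and consecutive chunks share up to 50 characters so text cut at a boundary remains retrievable from both sides.

    from langchain.docstore.document import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    sample = Document(page_content="A" * 1200)  # a single 1200-character document
    chunks = splitter.split_documents([sample])
    print([len(c.page_content) for c in chunks])  # roughly [500, 500, 300]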

3.4 Storing into the vector database

    Milvus.from_documents(
        texts,
        collection_name=collection_name,
        embedding=embeddings,
        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}
    )
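
Once the chunks are written, the same collection can be queried back through LangChain's Milvus wrapper. A minimal retrieval sketch; the collection name and query text are placeholders:

    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores import Milvus

    store = Milvus(
        embedding_function=OpenAIEmbeddings(),
        collection_name="my_kb",  # placeholder: the collection written above
        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
    )
    for doc in store.similarity_search("What is in my documents?", k=4):
        print(doc.page_content[:80])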

3.5 Full code

    #!/usr/bin/env python3
    import glob
    import os
    import shutil
    from multiprocessing import Pool
    from typing import List

    from dotenv import load_dotenv
    from langchain.docstore.document import Document
    from langchain.document_loaders import (
        CSVLoader,
        EverNoteLoader,
        PDFMinerLoader,
        TextLoader,
        UnstructuredEmailLoader,
        UnstructuredEPubLoader,
        UnstructuredHTMLLoader,
        UnstructuredMarkdownLoader,
        UnstructuredODTLoader,
        UnstructuredPowerPointLoader,
        UnstructuredWordDocumentLoader,
    )
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.vectorstores import Milvus
    from tqdm import tqdm

    load_dotenv(".env")

    MILVUS_HOST = os.environ.get('MILVUS_HOST')
    MILVUS_PORT = os.environ.get('MILVUS_PORT')
    source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
    KNOWLEDGE_FOLDER = os.environ.get('KNOWLEDGE_FOLDER')
    KNOWLEDGE_FOLDER_BK = os.environ.get('KNOWLEDGE_FOLDER_BK')
    chunk_size = 500
    chunk_overlap = 50

    # Custom document loader
    class MyElmLoader(UnstructuredEmailLoader):
        """Fall back to plain text when the default loader does not work."""

        def load(self) -> List[Document]:
            """If the EML has no HTML part, use text/plain instead."""
            try:
                try:
                    doc = UnstructuredEmailLoader.load(self)
                except ValueError as e:
                    if 'text/html content not found in email' in str(e):
                        # Try plain text
                        self.unstructured_kwargs["content_source"] = "text/plain"
                        doc = UnstructuredEmailLoader.load(self)
                    else:
                        raise
            except Exception as e:
                # Add file_path to exception message
                raise type(e)(f"{self.file_path}: {e}") from e
            return doc

    # Map file extensions to document loaders
    LOADER_MAPPING = {
        ".csv": (CSVLoader, {}),
        # ".docx": (Docx2txtLoader, {}),
        ".doc": (UnstructuredWordDocumentLoader, {}),
        ".docx": (UnstructuredWordDocumentLoader, {}),
        ".enex": (EverNoteLoader, {}),
        ".eml": (MyElmLoader, {}),
        ".epub": (UnstructuredEPubLoader, {}),
        ".html": (UnstructuredHTMLLoader, {}),
        ".md": (UnstructuredMarkdownLoader, {}),
        ".odt": (UnstructuredODTLoader, {}),
        ".pdf": (PDFMinerLoader, {}),
        ".ppt": (UnstructuredPowerPointLoader, {}),
        ".pptx": (UnstructuredPowerPointLoader, {}),
        ".txt": (TextLoader, {"encoding": "utf8"}),
    }

    def load_single_document(file_path: str) -> List[Document]:
        ext = "." + file_path.rsplit(".", 1)[-1]
        if ext in LOADER_MAPPING:
            loader_class, loader_args = LOADER_MAPPING[ext]
            loader = loader_class(file_path, **loader_args)
            return loader.load()
        raise ValueError(f"Unsupported file extension '{ext}'")

    def load_documents_knowledge(source_dir: str, secondary_directories: str) -> List[Document]:
        """
        Load all documents from the source directory recursively,
        parsing files in parallel across a process pool.
        """
        all_files = []
        for ext in LOADER_MAPPING:
            all_files.extend(
                glob.glob(os.path.join(source_dir, secondary_directories, f"**/*{ext}"), recursive=True)
            )
        filtered_files = [file_path for file_path in all_files if file_path]
        with Pool(processes=os.cpu_count()) as pool:
            results = []
            with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
                for i, docs in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
                    results.extend(docs)
                    pbar.update()
        return results

    def process_documents_knowledge(secondary_directories: str) -> List[Document]:
        """
        Load the documents and split them into chunks.
        """
        print(f"Loading from directory: {KNOWLEDGE_FOLDER}")
        documents = load_documents_knowledge(KNOWLEDGE_FOLDER, secondary_directories)
        if not documents:
            print("No files to load")
            exit(0)
        print(f"Loaded {len(documents)} documents from {KNOWLEDGE_FOLDER}")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        texts = text_splitter.split_documents(documents)
        print(f"Split into {len(texts)} chunks (at most {chunk_size} characters each)")
        return texts

    def main_knowledge(collection_name: str):
        # Create embeddings
        embeddings = OpenAIEmbeddings()
        texts = process_documents_knowledge(collection_name)
        Milvus.from_documents(
            texts,
            collection_name=collection_name,
            embedding=embeddings,
            connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT}
        )
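
The script above has no entry point; assuming the knowledge-base sub-directory name doubles as the Milvus collection name (as main_knowledge implies), training could be kicked off like this:

    if __name__ == "__main__":
        main_knowledge("my_kb")  # "my_kb" is a placeholder collection name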

4. Summary

This article walked through the data training process for a GPT-based private knowledge-base chatbot: its dependencies, the workflow, and the code. Data training is a key step in building the chatbot, and I hope this article helps. The next article will cover how to test and use the chatbot once the data is trained.

Original article: https://www.cnblogs.com/myshare/p/17548434.html
