东莞最好的网站,wordpress后台编辑主题时提示:抱歉_该文件无法被编辑,个人建设电影网站备案,在哪个网站可以学做衣服 提示： 想要了解更多有关内置文档加载器与第三方工具集成的文档，甚至包括了：哔哩哔哩网站加载器、区块链加载器、汇编音频文本、Datadog日志加载器等。 本文主要收集与讲解日常使用的加载器，足够咱们平时开发人工智能的工作使用。 想要了解更多有关内置文档加载器与第三方工具集成的文档，甚至包括了哔哩哔哩网站加载器、区块链加载器、汇编音频文本、Datadog日志加载器等。 本文主要收集与讲解日常使用的加载器，足够咱们平时开发人工智能的工作使用，大概有：csv加载器、text加载器、word加载器、html加载器、pdf加载器、文件目录加载器、json加载器等。
概述 使用文档加载器将数据从源加载为 Document。Document 是一段文本和相关的元数据。例如，有一些文档加载器用于加载简单的 .txt 文件，有的用于加载任何网页的文本内容，甚至还有用于加载 YouTube 视频文字稿的加载器。 文档加载器提供了一种“加载”（load）方法，用于从配置的源中将数据作为文档加载。它们还可选地实现“延迟加载”（lazy load），用于将数据延迟加载到内存中。 一、CSV 加载器 CSV 文件是使用逗号分隔值的分隔文本文件。文件的每一行都是一条数据记录，每条记录由一个或多个用逗号分隔的字段组成。 CSV 数据的每一行会被加载为一个文档。
from langchain_community.document_loaders.csv_loader import CSVLoaderloader CSVLoader(file_path./example_data/mlb_teams_2012.csv)
data loader.load()print(data) 打印结果 [Document(page_contentTeam: Nationals\nPayroll (millions): 81.34\nWins: 98, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 0}, lookup_index0), Document(page_contentTeam: Reds\nPayroll (millions): 82.20\nWins: 97, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 1}, lookup_index0), Document(page_contentTeam: Yankees\nPayroll (millions): 197.96\nWins: 95, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 2}, lookup_index0), Document(page_contentTeam: Giants\nPayroll (millions): 117.62\nWins: 94, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 3}, lookup_index0), Document(page_contentTeam: Braves\nPayroll (millions): 83.31\nWins: 94, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 4}, lookup_index0), Document(page_contentTeam: Athletics\nPayroll (millions): 55.37\nWins: 94, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 5}, lookup_index0), Document(page_contentTeam: Rangers\nPayroll (millions): 120.51\nWins: 93, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 6}, lookup_index0), Document(page_contentTeam: Orioles\nPayroll (millions): 81.43\nWins: 93, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 7}, lookup_index0), Document(page_contentTeam: Rays\nPayroll (millions): 64.17\nWins: 90, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 8}, lookup_index0), Document(page_contentTeam: Angels\nPayroll (millions): 154.49\nWins: 89, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 9}, lookup_index0), Document(page_contentTeam: Tigers\nPayroll (millions): 132.30\nWins: 88, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 10}, lookup_index0), Document(page_contentTeam: Cardinals\nPayroll (millions): 110.30\nWins: 88, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 11}, lookup_index0), 
Document(page_contentTeam: Dodgers\nPayroll (millions): 95.14\nWins: 86, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 12}, lookup_index0), Document(page_contentTeam: White Sox\nPayroll (millions): 96.92\nWins: 85, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 13}, lookup_index0), Document(page_contentTeam: Brewers\nPayroll (millions): 97.65\nWins: 83, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 14}, lookup_index0), Document(page_contentTeam: Phillies\nPayroll (millions): 174.54\nWins: 81, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 15}, lookup_index0), Document(page_contentTeam: Diamondbacks\nPayroll (millions): 74.28\nWins: 81, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 16}, lookup_index0), Document(page_contentTeam: Pirates\nPayroll (millions): 63.43\nWins: 79, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 17}, lookup_index0), Document(page_contentTeam: Padres\nPayroll (millions): 55.24\nWins: 76, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 18}, lookup_index0), Document(page_contentTeam: Mariners\nPayroll (millions): 81.97\nWins: 75, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 19}, lookup_index0), Document(page_contentTeam: Mets\nPayroll (millions): 93.35\nWins: 74, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 20}, lookup_index0), Document(page_contentTeam: Blue Jays\nPayroll (millions): 75.48\nWins: 73, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 21}, lookup_index0), Document(page_contentTeam: Royals\nPayroll (millions): 60.91\nWins: 72, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 22}, lookup_index0), Document(page_contentTeam: Marlins\nPayroll (millions): 118.07\nWins: 69, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 23}, lookup_index0), Document(page_contentTeam: Red 
Sox\nPayroll (millions): 173.18\nWins: 69, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 24}, lookup_index0), Document(page_contentTeam: Indians\nPayroll (millions): 78.43\nWins: 68, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 25}, lookup_index0), Document(page_contentTeam: Twins\nPayroll (millions): 94.08\nWins: 66, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 26}, lookup_index0), Document(page_contentTeam: Rockies\nPayroll (millions): 78.06\nWins: 64, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 27}, lookup_index0), Document(page_contentTeam: Cubs\nPayroll (millions): 88.19\nWins: 61, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 28}, lookup_index0), Document(page_contentTeam: Astros\nPayroll (millions): 60.65\nWins: 55, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 29}, lookup_index0)]
① 定制CSV解析和加载 参见csv模块文档了解支持哪些csv参数的更多信息。 下面直接在注释里面讲解几个比较常用的参数
loader CSVLoader(file_path./example_data/mlb_teams_2012.csv, csv_args{# 定界符用于分隔字段的单字符字符串。它默认为,delimiter: ,,# 引号字符用于引用包含特殊字符的字段的单字符字符串如定界符或者quotechar或者包含换行符。它默认为.quotechar: ,# 字段名称如果在创建对象时没有作为参数传递则在第一次访问或从文件中读取第一条记录时初始化该属性。fieldnames: [MLB Team, Payroll in millions, Wins]
})data loader.load()print(data) 打印结果 [Document(page_contentMLB Team: Team\nPayroll in millions: Payroll (millions)\nWins: Wins, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 0}, lookup_index0), Document(page_contentMLB Team: Nationals\nPayroll in millions: 81.34\nWins: 98, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 1}, lookup_index0), Document(page_contentMLB Team: Reds\nPayroll in millions: 82.20\nWins: 97, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 2}, lookup_index0), Document(page_contentMLB Team: Yankees\nPayroll in millions: 197.96\nWins: 95, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 3}, lookup_index0), Document(page_contentMLB Team: Giants\nPayroll in millions: 117.62\nWins: 94, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 4}, lookup_index0), Document(page_contentMLB Team: Braves\nPayroll in millions: 83.31\nWins: 94, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 5}, lookup_index0), Document(page_contentMLB Team: Athletics\nPayroll in millions: 55.37\nWins: 94, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 6}, lookup_index0), Document(page_contentMLB Team: Rangers\nPayroll in millions: 120.51\nWins: 93, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 7}, lookup_index0), Document(page_contentMLB Team: Orioles\nPayroll in millions: 81.43\nWins: 93, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 8}, lookup_index0), Document(page_contentMLB Team: Rays\nPayroll in millions: 64.17\nWins: 90, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 9}, lookup_index0), Document(page_contentMLB Team: Angels\nPayroll in millions: 154.49\nWins: 89, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 10}, lookup_index0), Document(page_contentMLB Team: Tigers\nPayroll in millions: 132.30\nWins: 88, lookup_str, metadata{source: 
./example_data/mlb_teams_2012.csv, row: 11}, lookup_index0), Document(page_contentMLB Team: Cardinals\nPayroll in millions: 110.30\nWins: 88, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 12}, lookup_index0), Document(page_contentMLB Team: Dodgers\nPayroll in millions: 95.14\nWins: 86, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 13}, lookup_index0), Document(page_contentMLB Team: White Sox\nPayroll in millions: 96.92\nWins: 85, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 14}, lookup_index0), Document(page_contentMLB Team: Brewers\nPayroll in millions: 97.65\nWins: 83, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 15}, lookup_index0), Document(page_contentMLB Team: Phillies\nPayroll in millions: 174.54\nWins: 81, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 16}, lookup_index0), Document(page_contentMLB Team: Diamondbacks\nPayroll in millions: 74.28\nWins: 81, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 17}, lookup_index0), Document(page_contentMLB Team: Pirates\nPayroll in millions: 63.43\nWins: 79, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 18}, lookup_index0), Document(page_contentMLB Team: Padres\nPayroll in millions: 55.24\nWins: 76, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 19}, lookup_index0), Document(page_contentMLB Team: Mariners\nPayroll in millions: 81.97\nWins: 75, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 20}, lookup_index0), Document(page_contentMLB Team: Mets\nPayroll in millions: 93.35\nWins: 74, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 21}, lookup_index0), Document(page_contentMLB Team: Blue Jays\nPayroll in millions: 75.48\nWins: 73, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 22}, lookup_index0), Document(page_contentMLB Team: Royals\nPayroll in millions: 60.91\nWins: 72, 
lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 23}, lookup_index0), Document(page_contentMLB Team: Marlins\nPayroll in millions: 118.07\nWins: 69, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 24}, lookup_index0), Document(page_contentMLB Team: Red Sox\nPayroll in millions: 173.18\nWins: 69, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 25}, lookup_index0), Document(page_contentMLB Team: Indians\nPayroll in millions: 78.43\nWins: 68, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 26}, lookup_index0), Document(page_contentMLB Team: Twins\nPayroll in millions: 94.08\nWins: 66, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 27}, lookup_index0), Document(page_contentMLB Team: Rockies\nPayroll in millions: 78.06\nWins: 64, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 28}, lookup_index0), Document(page_contentMLB Team: Cubs\nPayroll in millions: 88.19\nWins: 61, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 29}, lookup_index0), Document(page_contentMLB Team: Astros\nPayroll in millions: 60.65\nWins: 55, lookup_str, metadata{source: ./example_data/mlb_teams_2012.csv, row: 30}, lookup_index0)]
② 指定一个列来标识文档源 使用 source_column 参数为从每一行创建的文档指定一个源。否则 file_path 将用作从CSV文件创建的所有文档的源。 当将从CSV文件加载的文档用于使用源回答问题的链时这很有用。
loader CSVLoader(file_path./example_data/mlb_teams_2012.csv, source_columnTeam)data loader.load()print(data) [Document(page_contentTeam: Nationals\nPayroll (millions): 81.34\nWins: 98, lookup_str, metadata{source: Nationals, row: 0}, lookup_index0), Document(page_contentTeam: Reds\nPayroll (millions): 82.20\nWins: 97, lookup_str, metadata{source: Reds, row: 1}, lookup_index0), Document(page_contentTeam: Yankees\nPayroll (millions): 197.96\nWins: 95, lookup_str, metadata{source: Yankees, row: 2}, lookup_index0), Document(page_contentTeam: Giants\nPayroll (millions): 117.62\nWins: 94, lookup_str, metadata{source: Giants, row: 3}, lookup_index0), Document(page_contentTeam: Braves\nPayroll (millions): 83.31\nWins: 94, lookup_str, metadata{source: Braves, row: 4}, lookup_index0), Document(page_contentTeam: Athletics\nPayroll (millions): 55.37\nWins: 94, lookup_str, metadata{source: Athletics, row: 5}, lookup_index0), Document(page_contentTeam: Rangers\nPayroll (millions): 120.51\nWins: 93, lookup_str, metadata{source: Rangers, row: 6}, lookup_index0), Document(page_contentTeam: Orioles\nPayroll (millions): 81.43\nWins: 93, lookup_str, metadata{source: Orioles, row: 7}, lookup_index0), Document(page_contentTeam: Rays\nPayroll (millions): 64.17\nWins: 90, lookup_str, metadata{source: Rays, row: 8}, lookup_index0), Document(page_contentTeam: Angels\nPayroll (millions): 154.49\nWins: 89, lookup_str, metadata{source: Angels, row: 9}, lookup_index0), Document(page_contentTeam: Tigers\nPayroll (millions): 132.30\nWins: 88, lookup_str, metadata{source: Tigers, row: 10}, lookup_index0), Document(page_contentTeam: Cardinals\nPayroll (millions): 110.30\nWins: 88, lookup_str, metadata{source: Cardinals, row: 11}, lookup_index0), Document(page_contentTeam: Dodgers\nPayroll (millions): 95.14\nWins: 86, lookup_str, metadata{source: Dodgers, row: 12}, lookup_index0), Document(page_contentTeam: White Sox\nPayroll (millions): 96.92\nWins: 85, lookup_str, metadata{source: White Sox, row: 
13}, lookup_index0), Document(page_contentTeam: Brewers\nPayroll (millions): 97.65\nWins: 83, lookup_str, metadata{source: Brewers, row: 14}, lookup_index0), Document(page_contentTeam: Phillies\nPayroll (millions): 174.54\nWins: 81, lookup_str, metadata{source: Phillies, row: 15}, lookup_index0), Document(page_contentTeam: Diamondbacks\nPayroll (millions): 74.28\nWins: 81, lookup_str, metadata{source: Diamondbacks, row: 16}, lookup_index0), Document(page_contentTeam: Pirates\nPayroll (millions): 63.43\nWins: 79, lookup_str, metadata{source: Pirates, row: 17}, lookup_index0), Document(page_contentTeam: Padres\nPayroll (millions): 55.24\nWins: 76, lookup_str, metadata{source: Padres, row: 18}, lookup_index0), Document(page_contentTeam: Mariners\nPayroll (millions): 81.97\nWins: 75, lookup_str, metadata{source: Mariners, row: 19}, lookup_index0), Document(page_contentTeam: Mets\nPayroll (millions): 93.35\nWins: 74, lookup_str, metadata{source: Mets, row: 20}, lookup_index0), Document(page_contentTeam: Blue Jays\nPayroll (millions): 75.48\nWins: 73, lookup_str, metadata{source: Blue Jays, row: 21}, lookup_index0), Document(page_contentTeam: Royals\nPayroll (millions): 60.91\nWins: 72, lookup_str, metadata{source: Royals, row: 22}, lookup_index0), Document(page_contentTeam: Marlins\nPayroll (millions): 118.07\nWins: 69, lookup_str, metadata{source: Marlins, row: 23}, lookup_index0), Document(page_contentTeam: Red Sox\nPayroll (millions): 173.18\nWins: 69, lookup_str, metadata{source: Red Sox, row: 24}, lookup_index0), Document(page_contentTeam: Indians\nPayroll (millions): 78.43\nWins: 68, lookup_str, metadata{source: Indians, row: 25}, lookup_index0), Document(page_contentTeam: Twins\nPayroll (millions): 94.08\nWins: 66, lookup_str, metadata{source: Twins, row: 26}, lookup_index0), Document(page_contentTeam: Rockies\nPayroll (millions): 78.06\nWins: 64, lookup_str, metadata{source: Rockies, row: 27}, lookup_index0), Document(page_contentTeam: Cubs\nPayroll (millions): 
88.19\nWins: 61, lookup_str, metadata{source: Cubs, row: 28}, lookup_index0), Document(page_contentTeam: Astros\nPayroll (millions): 60.65\nWins: 55, lookup_str, metadata{source: Astros, row: 29}, lookup_index0)]
二、文件目录 File Directory 加载器 这包括如何加载目录中的所有文档。 默认情况下它使用非结构化加载程序.
from langchain_community.document_loaders import DirectoryLoader 我们可以使用 glob 参数来控制要加载的文件。请注意这里它不加载 .rst 文件或 .html 文件。
loader DirectoryLoader(../, glob**/*.md)docs loader.load()print(len(docs))打印结果 1
① 显示进度条 默认情况下不会显示进度条。要显示进度条，请先安装 tqdm 库（例如执行 pip install tqdm），并将 show_progress 参数设置为 True。
pip install tqdm
loader DirectoryLoader(../, glob**/*.md, show_progressTrue)
docs loader.load() 演示效果 Requirement already satisfied: tqdm in /Users/jon/.pyenv/versions/3.9.16/envs/microbiome-app/lib/python3.9/site-packages (4.65.0)0it [00:00, ?it/s]
② 使用多线程 默认情况下加载发生在一个线程中。为了利用几个线程请将 use_multithreading 标志为 true。
loader DirectoryLoader(../, glob**/*.md, use_multithreadingTrue)
docs loader.load() ③ 更改加载程序类 默认情况下使用 UnstructuredLoader 类。然而您可以非常容易地改变加载程序的类型。只需要指定参数 loader_cls 的类型。
from langchain_community.document_loaders import TextLoaderloader DirectoryLoader(../, glob**/*.md, loader_clsTextLoader)docs loader.load()len(docs)打印结果 1 如果需要加载 Python源代码文件请使用 PythonLoader .
from langchain_community.document_loaders import PythonLoaderloader DirectoryLoader(../../../../../, glob**/*.py, loader_clsPythonLoader)docs loader.load()len(docs) 打印结果 691
④ 使用文本加载器自动检测文件编码 在本例中我们将看到一些有用的策略特别是这些策略在加载大量随机文件时使用 TextLoader 类。 首先为了说明这个问题让我们尝试用任意编码加载多个文本。
path ../../../../../tests/integration_tests/examples
loader DirectoryLoader(path, glob**/*.txt, loader_clsTextLoader) A.默认行为
loader.load() 文件 example-non-utf8.txt 使用不同的编码因此load()函数失败并显示一条有用的消息指出哪个文件解码失败。 默认行为TextLoader加载任何文档失败都将导致整个加载过程失败并且不会加载任何文档。 B.无声失败 我们可以传递参数silent_errors到DirectoryLoader跳过无法加载的文件并继续加载过程。
loader DirectoryLoader(path, glob**/*.txt, loader_clsTextLoader, silent_errorsTrue)
docs loader.load() C.自动检测编码 我们也可以通过向加载器类传入 autodetect_encoding 参数，让 TextLoader 在加载失败之前先自动检测文件编码。
text_loader_kwargs{autodetect_encoding: True}
loader DirectoryLoader(path, glob**/*.txt, loader_clsTextLoader, loader_kwargstext_loader_kwargs)
docs loader.load()doc_sources [doc.metadata[source] for doc in docs]
print(doc_sources) 打印结果 [../../../../../tests/integration_tests/examples/example-non-utf8.txt,../../../../../tests/integration_tests/examples/whatsapp_chat.txt,../../../../../tests/integration_tests/examples/example-utf8.txt] 三、HTML 加载器 超文本标记语言或HTML是设计用于在web浏览器中显示的文档的标准标记语言。 这包括如何加载 HTML文档 转换成我们可以在下游使用的文档格式。
from langchain_community.document_loaders import UnstructuredHTMLLoaderloader UnstructuredHTMLLoader(example_data/fake-content.html)data loader.load()print(data) 打印结果 [Document(page_contentMy First Heading\n\nMy first paragraph., lookup_str, metadata{source: example_data/fake-content.html}, lookup_index0)]
① 用BeautifulSoup4加载HTML 我们也可以借助 BeautifulSoup4，通过 BSHTMLLoader 加载HTML文档。这会把HTML中的文本提取到 page_content 中，并把页面标题以 title 为键存入 metadata。
from langchain_community.document_loaders import BSHTMLLoader
loader BSHTMLLoader(example_data/fake-content.html)
data loader.load()
print(data) [Document(page_content\n\nTest Title\n\n\nMy First Heading\nMy first paragraph.\n\n\n, metadata{source: example_data/fake-content.html, title: Test Title})] 四、JSON 加载器 JSON 是一种开放的标准文件格式和数据交换格式它使用人类可读的文本来存储和传输由属性值对和数组或其他可序列化的值组成的数据对象。 JSON行是一种文件格式其中每一行都是有效的JSON值。 JSONLoader使用指定的jq模式解析JSON文件。它使用jq python包。详情看这个指南的详细文档 jq 语法。 pip install jq
from langchain_community.document_loaders import JSONLoader
import json
from pathlib import Path
from pprint import pprintfile_path./example_data/facebook_chat.json
data json.loads(Path(file_path).read_text())print(data) {image: {creation_timestamp: 1675549016, uri: image_of_the_chat.jpg},is_still_participant: True,joinable_mode: {link: , mode: 1},magic_words: [],messages: [{content: Bye!,sender_name: User 2,timestamp_ms: 1675597571851},{content: Oh no worries! Bye,sender_name: User 1,timestamp_ms: 1675597435669},{content: No Im sorry it was my mistake, the blue one is not for sale,sender_name: User 2,timestamp_ms: 1675596277579},{content: I thought you were selling the blue one!,sender_name: User 1,timestamp_ms: 1675595140251},{content: Im not interested in this bag. Im interested in the blue one!,sender_name: User 1,timestamp_ms: 1675595109305},{content: Here is $129,sender_name: User 2,timestamp_ms: 1675595068468},{photos: [{creation_timestamp: 1675595059,uri: url_of_some_picture.jpg}],sender_name: User 2,timestamp_ms: 1675595060730},{content: Online is at least $100,sender_name: User 2,timestamp_ms: 1675595045152},{content: How much do you want?,sender_name: User 1,timestamp_ms: 1675594799696},{content: Goodmorning! $50 is too low.,sender_name: User 2,timestamp_ms: 1675577876645},{content: Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!,sender_name: User 1,timestamp_ms: 1675549022673}],participants: [{name: User 1}, {name: User 2}],thread_path: inbox/User 1 and User 2 chat,title: User 1 and User 2 chat}
① 使用JSONLoader 假设我们想提取JSON数据中 messages 键下每条消息的 content 字段。这可以通过 JSONLoader 轻松完成，如下所示。
JSON文件
loader JSONLoader(file_path./example_data/facebook_chat.json,jq_schema.messages[].content,text_contentFalse)data loader.load()print(data) [Document(page_contentBye!, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 1}),Document(page_contentOh no worries! Bye, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 2}),Document(page_contentNo Im sorry it was my mistake, the blue one is not for sale, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 3}),Document(page_contentI thought you were selling the blue one!, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 4}),Document(page_contentIm not interested in this bag. Im interested in the blue one!, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 5}),Document(page_contentHere is $129, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 6}),Document(page_content, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 7}),Document(page_contentOnline is at least $100, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 8}),Document(page_contentHow much do you want?, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 9}),Document(page_contentGoodmorning! 
$50 is too low., metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 10}),Document(page_contentHi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!, metadata{source: /Users/avsolatorio/WBG/langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat.json, seq_num: 11})]
JSON行文件 如果您想从 JSON Lines 文件中加载文档，需要传递 json_lines=True，并指定 jq_schema，以便从单个 JSON 对象中提取 page_content。
file_path ./example_data/facebook_chat_messages.jsonl
print(Path(file_path).read_text()) ({sender_name: User 2, timestamp_ms: 1675597571851, content: Bye!}\n{sender_name: User 1, timestamp_ms: 1675597435669, content: Oh no worries! Bye}\n{sender_name: User 2, timestamp_ms: 1675596277579, content: No Im sorry it was my mistake, the blue one is not for sale}\n)
loader JSONLoader(file_path./example_data/facebook_chat_messages.jsonl,jq_schema.content,text_contentFalse,json_linesTrue)data loader.load()print(data) [Document(page_contentBye!, metadata{source: langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl, seq_num: 1}),Document(page_contentOh no worries! Bye, metadata{source: langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl, seq_num: 2}),Document(page_contentNo Im sorry it was my mistake, the blue one is not for sale, metadata{source: langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl, seq_num: 3})]
另一种做法是设置 jq_schema='.' 并提供 content_key。
loader JSONLoader(file_path./example_data/facebook_chat_messages.jsonl,jq_schema.,content_keysender_name,json_linesTrue)data loader.load()print(data) [Document(page_contentUser 2, metadata{source: langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl, seq_num: 1}),Document(page_contentUser 1, metadata{source: langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl, seq_num: 2}),Document(page_contentUser 2, metadata{source: langchain/docs/modules/indexes/document_loaders/examples/example_data/facebook_chat_messages.jsonl, seq_num: 3})]
带有jq模式的JSON文件content_key 要使用 jq 模式形式的 content_key 从JSON文件加载文档，请设置 is_content_key_jq_parsable=True。请确保 content_key 与 jq 模式兼容，并且可以用 jq 模式解析。
file_path ./sample.json
pprint(Path(file_path).read_text()) {data: [{attributes: {message: message1,tags: [tag1]},id: 1},{attributes: {message: message2,tags: [tag2]},id: 2}]}
loader JSONLoader(file_pathfile_path,jq_schema.data[],content_key.attributes.message,is_content_key_jq_parsableTrue,
)data loader.load()print(data) [Document(page_contentmessage1, metadata{source: /path/to/sample.json, seq_num: 1}),Document(page_contentmessage2, metadata{source: /path/to/sample.json, seq_num: 2})]
五、PDF Loader 加载器
目前市面上有很多的pdf加载器下面会挑选几款受欢迎的展示具体要使用哪种自行选择。
① 使用 PyPDF PyPDF是一个功能全面的库它允许用户进行PDF的读取、分割、合并以及转换等操作。这个库的优点在于其轻量且纯Python编写没有庞大的依赖因此安装和使用相对简单。此外PyPDF跨平台性好能够在Windows、macOS和Linux上良好运行。然而它可能不支持PDF 1.7及以上版本的某些特性对于处理带有复杂特性的最新PDF文件可能会存在限制。
加载PDF使用pypdf文档数组其中每个文档都包含页面内容和元数据page号码。
pip install pypdf
from langchain_community.document_loaders import PyPDFLoaderloader PyPDFLoader(example_data/layout-parser-paper.pdf)
pages loader.load_and_split()
print(pages[0]) Document(page_contentLayoutParser : A Uni\x0ced Toolkit for Deep\nLearning Based Document Image Analysis\nZejiang Shen1( \x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\nLee4, Jacob Carlson3, and Weining Li5\n1Allen Institute for AI\nshannonsallenai.org\n2Brown University\nruochen zhangbrown.edu\n3Harvard University\nfmelissadell,jacob carlson gfas.harvard.edu\n4University of Washington\nbcglcs.washington.edu\n5University of Waterloo\nw422liuwaterloo.ca\nAbstract. Recent advances in document image analysis (DIA) have been\nprimarily driven by the application of neural networks. Ideally, research\noutcomes could be easily deployed in production and extended for further\ninvestigation. However, various factors like loosely organized codebases\nand sophisticated model con\x0cgurations complicate the easy reuse of im-\nportant innovations by a wide audience. Though there have been on-going\ne\x0borts to improve reusability and simplify deep learning (DL) model\ndevelopment in disciplines like natural language processing and computer\nvision, none of them are optimized for challenges in the domain of DIA.\nThis represents a major gap in the existing toolkit, as DIA is central to\nacademic research across a wide range of disciplines in the social sciences\nand humanities. This paper introduces LayoutParser , an open-source\nlibrary for streamlining the usage of DL in DIA research and applica-\ntions. The core LayoutParser library comes with a set of simple and\nintuitive interfaces for applying and customizing DL models for layout de-\ntection, character recognition, and many other document processing tasks.\nTo promote extensibility, LayoutParser also incorporates a community\nplatform for sharing both pre-trained models and full document digiti-\nzation pipelines. 
We demonstrate that LayoutParser is helpful for both\nlightweight and large-scale digitization pipelines in real-word use cases.\nThe library is publicly available at https://layout-parser.github.io .\nKeywords: Document Image Analysis ·Deep Learning ·Layout Analysis\n·Character Recognition ·Open Source library ·Toolkit.\n1 Introduction\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\ndocument image analysis (DIA) tasks including document image classi\x0ccation [ 11,arXiv:2103.15348v2 [cs.CV] 21 Jun 2021, metadata{source: example_data/layout-parser-paper.pdf, page: 0})
这种方法的一个优点是可以通过页码检索文档。
应用实例
import os
import getpassos.environ[OPENAI_API_KEY] getpass.getpass(OpenAI API Key:)from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddingsfaiss_index FAISS.from_documents(pages, OpenAIEmbeddings())
docs faiss_index.similarity_search(How will the community be engaged?, k2)
for doc in docs:print(str(doc.metadata[page]) :, doc.page_content[:300]) 9: 10 Z. Shen et al.Fig. 4: Illustration of (a) the original historical Japanese document with layoutdetection results and (b) a recreated version of the document image that achievesmuch better character recognition recall. The reorganization algorithm rearrangesthe tokens based on the their detect3: 4 Z. Shen et al.Efficient Data AnnotationC u s t o m i z e d M o d e l T r a i n i n gModel Cust omizationDI A Model HubDI A Pipeline SharingCommunity PlatformLa y out Detection ModelsDocument ImagesT h e C o r e L a y o u t P a r s e r L i b r a r yOCR ModuleSt or age VisualizationLa y ou
② 使用PyPDFium2 PyPDFium2是基于PDFium的Python绑定。PDFium是一个由Google开发的快速且功能丰富的PDF渲染引擎因此PyPDFium2在PDF渲染和文本提取方面可能具有出色的性能。对于那些主要关注PDF的渲染和文本提取的用户来说PyPDFium2可能是一个理想的选择。
from langchain_community.document_loaders import PyPDFium2Loaderloader PyPDFium2Loader(example_data/layout-parser-paper.pdf)data loader.load()
③ 使用PDFMiner PDFMiner是一个专注于从PDF文档中提取文本和元数据的库。它不仅支持基本的文本提取功能还提供了许多高级特性如表格分析和图像处理。PDFMiner的API设计简洁易懂方便开发者快速上手并且具有良好的跨平台兼容性。无论是简单的文本提取还是复杂的页面布局分析PDFMiner都能满足各种需求。
from langchain_community.document_loaders import PDFMinerLoaderloader PDFMinerLoader(example_data/layout-parser-paper.pdf)data loader.load()
④ 特殊的 PyPDF Directory 从目录加载pdf
from langchain_community.document_loaders import PyPDFDirectoryLoaderloader PyPDFDirectoryLoader(example_data/)docs loader.load()
⑤ 特殊的 使用非结构化 非结构化的PDF指的是PDF文件中的信息没有按照一定的结构或格式进行组织而是以原始的、未加工的形式呈现。这类PDF文件中的数据没有预定义的数据模型不方便用数据库二维逻辑表来表现也不便于提取和解析。因此非结构化的PDF文件可能看起来比较杂乱缺乏统一的结构和格式使得用户难以直接获取所需的信息。
from langchain_community.document_loaders import UnstructuredPDFLoaderloader UnstructuredPDFLoader(example_data/layout-parser-paper.pdf)data loader.load()
六、Word 加载器含.doc 和 .docx
在langchain里面word只有一个非结构化的word加载器UnstructuredWordDocumentLoader。
环境准备
pip install unstructured
pip install python-doc
pip install python-docx
示例代码
from langchain_community.document_loaders import UnstructuredWordDocumentLoaderloader UnstructuredWordDocumentLoader(example_data/layout-parser-paper.doc)data loader.load()
七、Text 加载器.txt 加载器
在langchain里面.txt只有一个text加载器TextLoader。
from langchain_community.document_loaders import TextLoaderloader TextLoader(example_data/layout-parser-paper.txt)data loader.load()
八、完整代码
下面是通过经验总结的常用文件加载器的函数可直接使用。
from langchain_community.document_loaders import (UnstructuredWordDocumentLoader,CSVLoader,PyPDFLoader,TextLoader,DirectoryLoader,
)
import os
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader# load PDF files from directory
# def load_pdf_from_dir_2(directory_path):
# data []
# for filename in os.listdir(directory_path):
# if filename.endswith(.pdf):
# print(filename)
# # print the file name
# loader PyPDFLoader(f{directory_path}/{filename})
# print(loader)
# data.append(loader.load())
# return data# load PDF files from directory
def load_pdf_from_dir(directory_path):
    """Load the PDF documents of a directory through ``PyPDFDirectoryLoader``.

    Args:
        directory_path: Directory passed straight to ``PyPDFDirectoryLoader``.

    Returns:
        The value returned by ``PyPDFDirectoryLoader(directory_path).load()``.
    """
    loader = PyPDFDirectoryLoader(directory_path)
    return loader.load()


# load PDF files from a pdf file
def load_pdf_from_one(filepath):
    """Load a single ``.pdf`` file through ``PyPDFLoader``.

    Args:
        filepath: Path string of the file to load.

    Returns:
        The result of ``PyPDFLoader(filepath).load()`` when ``filepath`` ends
        with ``.pdf``; otherwise an empty list, so callers always get a
        list-like (and falsy-when-empty) value back.
    """
    if not filepath.endswith(".pdf"):
        return []

    # print the file name
    print(filepath)
    loader = PyPDFLoader(f"{filepath}")
    print(loader)
    return loader.load()


# load Word files(.doc/.docx) from directory
def load_word_from_dir(directory_path):
    """Load every ``.doc`` / ``.docx`` file directly inside a directory.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``
            (not recursive).

    Returns:
        A list with one element per matching file, each element being the
        result of that file's ``UnstructuredWordDocumentLoader.load()`` call.
        Empty when no entry matches.
    """
    data = []
    for filename in os.listdir(directory_path):
        # Only Word documents are handled here; everything else is skipped.
        if filename.endswith((".doc", ".docx")):
            # Use LangChain's built-in Word document loader.
            loader = UnstructuredWordDocumentLoader(f"{directory_path}/{filename}")
            data.append(loader.load())
    return data


# load Word files(.doc/.docx) from a filename
def load_word_from_one(filename):
    """Load a single ``.doc`` / ``.docx`` file.

    Args:
        filename: Path string of the file to load.

    Returns:
        The result of ``UnstructuredWordDocumentLoader(filename).load()`` when
        ``filename`` ends with ``.doc`` or ``.docx``; otherwise an empty list.
    """
    if not filename.endswith((".doc", ".docx")):
        return []

    # print the file name
    print(filename)
    loader = UnstructuredWordDocumentLoader(f"{filename}")
    print(loader)
    return loader.load()


# load Text files(.txt) from directory
def load_txt_from_dir(directory_path):
    """Load every ``.txt`` file directly inside a directory with ``TextLoader``.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``
            (not recursive).

    Returns:
        A list with one element per matching file, each element being the
        result of that file's ``TextLoader.load()`` call. Empty when no entry
        matches.
    """
    data = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            print(filename)
            loader = TextLoader(f"{directory_path}/{filename}")
            print(loader)
            data.append(loader.load())
    return data


# load Text files(.txt) from a filename
def load_text_from_one(filename):
    """Load a single ``.txt`` file with ``TextLoader``.

    Args:
        filename: Path string of the file to load.

    Returns:
        The result of ``TextLoader(filename).load()`` when ``filename`` ends
        with ``.txt``; otherwise an empty list.
    """
    if not filename.endswith(".txt"):
        return []

    # print the file name
    print(filename)
    loader = TextLoader(f"{filename}")
    print(loader)
    return loader.load()


# load CSV files(.csv) from directory
def load_csv_from_dir(directory_path):
    """Load every ``.csv`` file directly inside a directory with ``CSVLoader``.

    Args:
        directory_path: Directory whose entries are listed with ``os.listdir``
            (not recursive).

    Returns:
        A list with one element per matching file, each element being the
        result of that file's ``CSVLoader.load()`` call. Empty when no entry
        matches.
    """
    data = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".csv"):
            print(filename)
            loader = CSVLoader(f"{directory_path}/{filename}")
            print(loader)
            data.append(loader.load())
    return data


# load CSV files(.csv) from a filename
def load_csv_from_one(filename):
    """Load a single ``.csv`` file with ``CSVLoader``.

    Args:
        filename: Path string of the file to load.

    Returns:
        The result of ``CSVLoader(filename).load()`` when ``filename`` ends
        with ``.csv``; otherwise an empty list.
    """
    if not filename.endswith(".csv"):
        return []

    # print the file name
    print(filename)
    loader = CSVLoader(f"{filename}")
    print(loader)
    return loader.load()


# load all files from directory
def load_all_from_dir(
    directory_path,
    glob,
    show_progress=False,
    use_multithreading=False,
    loader_cls=UnstructuredFileLoader,
):
    """Load files from a directory through ``DirectoryLoader``.

    Args:
        directory_path: Directory handed to ``DirectoryLoader``.
        glob: Pattern such as ``**/*.<suffix>`` that controls which files are
            loaded.
        show_progress: Set to ``True`` to display a progress bar.
        use_multithreading: Set to ``True`` to load with multiple threads.
        loader_cls: Loader class applied to each file, e.g. ``CSVLoader``;
            defaults to ``UnstructuredFileLoader``.

    Returns:
        The value returned by ``DirectoryLoader.load()``.
    """
    loader = DirectoryLoader(
        directory_path,
        glob=glob,
        show_progress=show_progress,
        use_multithreading=use_multithreading,
        loader_cls=loader_cls,
    )
    return loader.load()


if __name__ == "__main__":
    res = load_pdf_from_dir("./testdir")
    print(res)

# Author's closing note from the original article (kept verbatim):
# 创作不易高抬贵手三连点赞、收藏、关注同学们的满意是我H-大叔的动力。
# 代码运行有问题或其他建议请在留言区评论看到就会回复不用私聊。
# 专栏人工智能 | 大模型 | 实战与教程里面还有其他人工智能|大数据方面的文章可继续食用持续更新。