羲和数据集收集器0.8

为了使代码能够识别并提取文档中的内容，我们需要进一步优化文本解析和问答对提取的逻辑，确保能够正确识别和提取其中的问答对。
以下是对代码的进一步完善：

python

import os
import json
import fitz  # PyMuPDF
import docx
import re
import tkinter as tk
from tkinter import filedialog, messagebox, simpledialog
import logging# 设置日志记录
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Paths chosen in the GUI. Defined up front so start_processing() can test it
# even when the user has not opened the file dialog yet.
files = ()

# A line that opens a new question: "1. ...", "Q: ...", "问题: ..." or "label: ...".
_QUESTION_START = re.compile(r'^(?:\d+\.|Q:|问题:|\w+:)\s+')
# A line that carries the answer to the question currently being collected.
_ANSWER_START = re.compile(r'^(?:A|答|答案|回答):\s*')
# Sub-section ("1.1 ..."), enumerated item ("(1) ...") or note ("注 1:") lines.
_SUBSECTION_START = re.compile(r'^(?:\d+\.\d+\s+|\(\d+\)\s+|注 \d+:)')


def clean_text(text):
    """Collapse whitespace runs to a single space and remove punctuation.

    Every character that is neither a word character nor whitespace is
    deleted. That includes the newlines, ``:`` and ``.`` that
    extract_qa_pairs_from_content keys on, so this must only be applied to
    text that has already been parsed.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()


def _clean_pairs(qa_pairs):
    """Apply clean_text to each extracted field and drop pairs left empty."""
    cleaned = []
    for pair in qa_pairs:
        question = clean_text(pair['question'])
        xihe_answers = [clean_text(answer) for answer in pair['xihe_answers']]
        ling_answers = [clean_text(answer) for answer in pair['ling_answers']]
        if question and any(xihe_answers):
            cleaned.append({
                'question': question,
                'xihe_answers': xihe_answers,
                'ling_answers': ling_answers,
            })
    return cleaned


def _extract_from_raw_text(raw_text):
    """Parse raw document text first, then clean the extracted fields."""
    return _clean_pairs(extract_qa_pairs_from_content(raw_text))


def extract_qa_pairs_from_txt(file_path):
    """Return the question/answer pairs found in a UTF-8 text file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    return _extract_from_raw_text(content)


def extract_qa_pairs_from_docx(file_path):
    """Return the question/answer pairs found in a Word document."""
    doc = docx.Document(file_path)
    content = '\n'.join(para.text for para in doc.paragraphs)
    return _extract_from_raw_text(content)


def extract_qa_pairs_from_pdf(file_path):
    """Return the question/answer pairs found in a PDF document."""
    # The context manager closes the document even if text extraction fails.
    with fitz.open(file_path) as doc:
        content = ''.join(page.get_text() for page in doc)
    return _extract_from_raw_text(content)


def _make_pair(question, answer_lines):
    """Build one output record; both answer lists hold the same joined text."""
    answer = ' '.join(answer_lines)
    return {'question': question, 'xihe_answers': [answer], 'ling_answers': [answer]}


def extract_qa_pairs_from_content(content):
    """Split raw text into question/answer records, one line at a time.

    A line starting with "N. ", "Q: ", "问题: " or any "label: " opens a new
    question; the number/label prefix is stripped from the question text.
    Lines starting with "A:", "答:", "答案:" or "回答:" are added (without the
    marker) to the open question's answer. Sub-section, "(N)" and "注 N:"
    lines are kept only once the answer already has some text. Every other
    non-blank line is answer text. A question with no answer is discarded.

    Returns a list of dicts with keys 'question', 'xihe_answers' and
    'ling_answers'.
    """
    qa_pairs = []
    current_question = None
    current_answer = []

    for raw_line in content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Must be tested before _QUESTION_START: "A: ..." also matches "label: ".
        if current_question:
            answer_marker = _ANSWER_START.match(line)
            if answer_marker:
                answer_text = line[answer_marker.end():]
                if answer_text:
                    current_answer.append(answer_text)
                continue

        if _QUESTION_START.match(line):
            if current_question and current_answer:
                qa_pairs.append(_make_pair(current_question, current_answer))
            question = re.sub(r'^\d+\.\s+', '', line)
            current_question = re.sub(r'^(Q:|问题:|[\w\s]+:)\s+', '', question)
            current_answer = []
        elif _SUBSECTION_START.match(line):
            if current_answer:
                current_answer.append(line)
        else:
            current_answer.append(line)

    if current_question and current_answer:
        qa_pairs.append(_make_pair(current_question, current_answer))
    return qa_pairs


def extract_qa_pairs_from_file(file_path):
    """Dispatch to the extractor matching the file extension.

    The extension is compared case-insensitively.

    Raises:
        ValueError: if the file is not a .txt, .docx or .pdf file.
    """
    lowered = file_path.lower()
    if lowered.endswith('.txt'):
        return extract_qa_pairs_from_txt(file_path)
    if lowered.endswith('.docx'):
        return extract_qa_pairs_from_docx(file_path)
    if lowered.endswith('.pdf'):
        return extract_qa_pairs_from_pdf(file_path)
    raise ValueError("Unsupported file type")


def write_qa_pairs_to_file(qa_pairs, output_file):
    """Write one JSON object per line (UTF-8, non-ASCII kept readable)."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for pair in qa_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + '\n')


def generate_output_filename(files):
    """Join the input base names with '_' and append '_qa.txt'."""
    base_names = [os.path.splitext(os.path.basename(path))[0] for path in files]
    return "_".join(base_names) + "_qa.txt"


def process_files(files, output_file):
    """Extract pairs from every input file and save them to output_file.

    A file that fails is logged and reported in a dialog; the remaining
    files are still processed.
    """
    qa_pairs = []
    for path in files:
        try:
            logging.info("Processing file: %s", path)
            qa_pairs.extend(extract_qa_pairs_from_file(path))
        except Exception as e:
            logging.error("Error processing %s: %s", path, e)
            messagebox.showerror("错误", f"处理文件时出错: {path}\n{e}")

    write_qa_pairs_to_file(qa_pairs, output_file)
    # One line is written per pair, so the file does not need to be re-read.
    num_lines = len(qa_pairs)
    messagebox.showinfo("完成", f"提取了问答对并保存到 {output_file}\n共 {num_lines} 行数据")


def select_files():
    """Ask for input files and show the chosen paths in the window."""
    global files
    files = filedialog.askopenfilenames(
        filetypes=[("Text files", "*.txt"), ("Word files", "*.docx"), ("PDF files", "*.pdf")]
    )
    if files:
        file_list_label.config(text="\n".join(files))


def start_processing():
    """Process the selected files, or show an error if none are selected."""
    if not files:
        messagebox.showerror("错误", "请选择输入文件")
        return
    output_file = generate_output_filename(files)
    process_files(files, output_file)


# Create the main window.
root = tk.Tk()
root.title("问答对提取工具")

# Widgets in top-to-bottom order: file picker, list of chosen paths, start button.
select_files_button = tk.Button(master=root, text="选择文件", command=select_files)
file_list_label = tk.Label(master=root, text="")
start_button = tk.Button(master=root, text="开始处理", command=start_processing)

# Pack in the same order the widgets were created so the layout is unchanged.
select_files_button.pack(pady=10)
file_list_label.pack()
start_button.pack(pady=20)

# Enter the Tk event loop; this call blocks until the window is closed.
root.mainloop()

优化内容提取逻辑
为了更好地处理你提供的示例内容，我们需要调整 extract_qa_pairs_from_content 函数，使其能够更灵活地识别和提取问答对。以下是一些可能的优化：

识别标题和段落：识别文档中的标题和段落，将其作为潜在的问答对。
处理多行答案：确保多行答案能够正确合并。
python

def extract_qa_pairs_from_content(content):qa_pairs = []lines = content.split('\n')current_question = Nonecurrent_answer = []for line in lines:# 检查是否为新的问答对或定义开始if re.match(r'^\d+\.\s+', line) or re.match(r'^Q:\s+', line) or re.match(r'^问题:\s+', line) or re.match(r'^\w+:\s+', line):if current_question and current_answer:qa_pairs.append({'question': current_question, 'xihe_answers': [' '.join(current_answer)], 'ling_answers': [' '.join(current_answer)]})current_question = re.sub(r'^\d+\.\s+', '', line)current_question = re.sub(r'^(Q:|问题:|[\w\s]+:)\s+', '', current_question)current_answer = []elif re.match(r'^\d+\.\d+\s+', line) or re.match(r'^$$\d+$$\s+', line) or re.match(r'^注 \d+:', line):  # 子章节或注释开始if current_answer:current_answer.append(line)else:current_answer.append(line)if current_question and current_answer:qa_pairs.append({'question': current_question, 'xihe_answers': [' '.join(current_answer)], 'ling_answers': [' '.join(current_answer)]})# 处理没有明确标记的问答对if not qa_pairs:for line in lines:if ':' in line:parts = line.split(':', 1)if len(parts) == 2:question, answer = partsqa_pairs.append({'question': question.strip(), 'xihe_answers': [answer.strip()], 'ling_answers': [answer.strip()]})return qa_pairs

说明：
识别标题和段落：通过检查行中是否包含 : 来识别潜在的问答对。
处理多行答案：确保多行答案能够正确合并。
运行步骤：
1. 确保安装了所需的库：pip install pymupdf python-docx
2. 将上述代码保存为一个 Python 文件，例如 qa_extractor_gui.py。
3. 运行该文件：python qa_extractor_gui.py。
4. 使用 GUI 选择一个或多个文件，点击“开始处理”按钮进行文件处理。
这样，你就可以通过一个友好的图形界面来选择文件，并进行问答对的提取和保存。希望这些改进能更好地处理你提供的示例内容！

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.xdnf.cn/news/10438.html

如若内容造成侵权/违法违规/事实不符，请联系一条长河网进行投诉反馈，一经查实，立即删除！