长文本处理技术综述:突破上下文限制
前言
大模型的上下文窗口是有限的,但很多应用场景需要处理超长文本。如何高效处理长文本是大模型应用开发中的重要挑战。
我在项目中处理过各种长文本场景,从法律文档分析到代码仓库理解。今天分享一些常用的长文本处理技术。
文本分块技术
基于长度的分块
class FixedSizeChunker:
    """Split text into fixed-size, overlapping chunks of words.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    each chunk is those words re-joined with single spaces.

    Attributes:
        chunk_size: Maximum number of words in one chunk.
        overlap: Number of words shared by two consecutive chunks.
    """

    def __init__(self, chunk_size: int = 512, overlap: int = 50):
        """Store and validate the chunking parameters.

        Args:
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Words repeated between neighbouring chunks; must satisfy
                ``0 <= overlap < chunk_size`` so that every chunk advances.

        Raises:
            ValueError: If the parameters would produce a non-positive stride
                or leave gaps between chunks.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={overlap}, chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk(self, text: str) -> list:
        """Return the list of chunk strings for ``text``.

        Args:
            text: The text to split.

        Returns:
            A list of strings, each holding at most ``chunk_size`` words.
            Empty or whitespace-only input yields an empty list.
        """
        words = text.split()
        stride = self.chunk_size - self.overlap
        chunks = []
        for start in range(0, len(words), stride):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            # Once a chunk reaches the last word, any further chunk would only
            # repeat words already covered by this one.
            if end >= len(words):
                break
        return chunks


# Article section heading (originally fused to the end of this line): 基于语义的分块
import re


class SemanticChunker:
    """Group paragraphs into chunks that fit an estimated token budget.

    Paragraph boundaries are runs of two or more newlines. Paragraphs are
    never split, so a single paragraph larger than the budget is emitted as
    its own oversized chunk.

    Attributes:
        max_tokens: Estimated token budget for one chunk.
    """

    def __init__(self, max_tokens: int = 512):
        """Store the per-chunk token budget."""
        self.max_tokens = max_tokens

    def chunk(self, text: str) -> list:
        """Split ``text`` on paragraph boundaries and pack paragraphs greedily.

        Args:
            text: The text to split.

        Returns:
            A list of chunk strings; paragraphs inside one chunk are joined
            with a blank line. Empty or whitespace-only input yields an
            empty list.
        """
        # Leading/trailing blank lines make re.split yield empty strings;
        # drop them so they never become (or pad) a chunk.
        paragraphs = [p for p in re.split(r'\n\n+', text) if p.strip()]

        chunks = []
        current_chunk = []
        current_size = 0
        for para in paragraphs:
            para_size = self._count_tokens(para)
            # Flush before adding a paragraph that would overflow the budget.
            if current_chunk and current_size + para_size > self.max_tokens:
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_size = 0
            current_chunk.append(para)
            current_size += para_size

        if current_chunk:
            chunks.append("\n\n".join(current_chunk))
        return chunks

    def _count_tokens(self, text: str) -> int:
        """Estimate the token count as one token per four characters."""
        return len(text) // 4


# Article section heading (originally fused to the end of this line): Map-Reduce 策略
class MapReduceProcessor:
    """Process a long text with a map step per chunk and one reduce step.

    Attributes:
        llm: Object exposing ``generate(prompt)``; its return values are
            joined as strings in the reduce step.
        chunker: The ``FixedSizeChunker`` used to split the input text.
    """

    def __init__(self, llm, chunk_size: int = 512):
        """Create the processor.

        Args:
            llm: Object exposing ``generate(prompt)``.
            chunk_size: Maximum number of words per chunk.
        """
        self.llm = llm
        # Keep the 50-word overlap for ordinary sizes; for chunk_size <= 50
        # that overlap would leave a zero or negative stride, so use half
        # the chunk size instead.
        overlap = 50 if chunk_size > 50 else chunk_size // 2
        self.chunker = FixedSizeChunker(chunk_size=chunk_size, overlap=overlap)

    def process(self, task: str, text: str) -> str:
        """Run ``task`` over ``text`` and return the combined answer.

        Args:
            task: Task description inserted into every prompt.
            text: The long text to process.

        Returns:
            The result of the final reduce call, or ``""`` when the text
            produces no chunks (no LLM call is made in that case).
        """
        chunks = self.chunker.chunk(text)
        if not chunks:
            return ""

        # Map phase: extract task-relevant information from every chunk.
        total = len(chunks)
        summaries = []
        for index, chunk in enumerate(chunks, start=1):
            prompt = f"任务:{task} 文本片段 {index}/{total}: {chunk} 请提取与任务相关的信息:"
            summaries.append(self.llm.generate(prompt))

        # Reduce phase: merge the per-chunk results into one answer.
        combined = "\n\n".join(summaries)
        prompt = f"任务:{task} 各部分分析结果: {combined} 请综合以上信息给出最终回答:"
        return self.llm.generate(prompt)


# Article section heading (originally fused to the end of this line): 滑动窗口技术
class SlidingWindowProcessor:
    """Analyse a long text window by window, then synthesize the results.

    Attributes:
        llm: Object exposing ``generate(prompt)``; its return values are
            joined as strings during synthesis.
        window_size: Number of words in one window.
        step: Number of words the window advances each iteration.
    """

    def __init__(self, llm, window_size: int = 512, step: int = 256):
        """Create the processor.

        Args:
            llm: Object exposing ``generate(prompt)``.
            window_size: Words per window; must be positive.
            step: Words to advance per window; must be positive.

        Raises:
            ValueError: If ``window_size`` or ``step`` is not positive.
        """
        if window_size <= 0:
            raise ValueError(f"window_size must be positive, got {window_size}")
        if step <= 0:
            raise ValueError(f"step must be positive, got {step}")
        self.llm = llm
        self.window_size = window_size
        self.step = step

    def process(self, task: str, text: str) -> str:
        """Run ``task`` over every window of ``text`` and merge the results.

        Args:
            task: Task description inserted into every prompt.
            text: The long text to process.

        Returns:
            The synthesized answer, or ``""`` for empty or whitespace-only
            text (no LLM call is made in that case).
        """
        words = text.split()
        if not words:
            return ""

        results = []
        for start in range(0, len(words), self.step):
            end = start + self.window_size
            window_text = " ".join(words[start:end])
            prompt = f"任务:{task} 文本:{window_text} 分析:"
            results.append(self.llm.generate(prompt))
            # A window that reaches the last word already covers everything a
            # later window would contain, so stop to avoid redundant calls.
            if end >= len(words):
                break

        return self._synthesize(results, task)

    def _synthesize(self, results: list, task: str) -> str:
        """Ask the LLM for one combined answer from the per-window results."""
        combined = "\n\n".join(results)
        prompt = f"基于以下分析结果,给出综合回答: {combined} 任务:{task} 综合回答:"
        return self.llm.generate(prompt)


# Article section heading (originally fused to the end of this line): 递归总结
class RecursiveSummarizer:
    """Summarize long text by recursively halving it and merging summaries.

    Attributes:
        llm: Object exposing ``generate(prompt)``.
        target_length: Estimated token count at or below which text is
            returned unchanged instead of being summarized.
    """

    def __init__(self, llm, target_length: int = 500):
        """Store the LLM handle and the target length."""
        self.llm = llm
        self.target_length = target_length

    def summarize(self, text: str) -> str:
        """Return ``text`` itself if short enough, otherwise a summary of it.

        Longer text is split in half, each half is summarized recursively,
        and the two partial summaries are summarized together by the LLM.

        Args:
            text: The text to summarize.

        Returns:
            The original text when its estimated token count is within
            ``target_length``; otherwise the LLM's summary.
        """
        if self._count_tokens(text) <= self.target_length:
            return text

        left, right = self._split_in_half(text)
        if not left or not right:
            # Nothing left to split (whitespace only, or a single character):
            # recursing further could never make progress.
            return left or right

        left_summary = self.summarize(left)
        right_summary = self.summarize(right)

        combined = f"{left_summary}\n\n{right_summary}"
        prompt = f"请总结以下内容: {combined} 总结:"
        return self.llm.generate(prompt)

    def _split_in_half(self, text: str) -> tuple:
        """Split ``text`` into two halves, by words when possible.

        Text without internal whitespace (for example CJK prose) is a single
        "word", so it is split at the middle character instead; otherwise one
        half would equal the input and the recursion would never terminate.
        """
        words = text.split()
        if len(words) >= 2:
            mid = len(words) // 2
            return " ".join(words[:mid]), " ".join(words[mid:])
        if words:
            token = words[0]
            mid = len(token) // 2
            return token[:mid], token[mid:]
        return "", ""

    def _count_tokens(self, text: str) -> int:
        """Estimate the token count as one token per four characters."""
        return len(text) // 4


# Article section heading (originally fused to the end of this line): 实际应用
class LongTextAnalyzer:
    """Analyse documents on disk and answer questions about them.

    Attributes:
        llm: Object exposing ``generate(prompt)``.
        processor: ``MapReduceProcessor`` used for whole-document analysis.
        encoding: Text encoding used when reading documents.
    """

    def __init__(self, llm, encoding: str = "utf-8"):
        """Create the analyzer.

        Args:
            llm: Object exposing ``generate(prompt)``.
            encoding: Encoding used to read documents. An explicit default
                avoids depending on the platform's locale encoding.
        """
        self.llm = llm
        self.processor = MapReduceProcessor(llm)
        self.encoding = encoding

    def analyze_document(self, document_path: str, task: str) -> str:
        """Read the document and run ``task`` over its full text.

        Args:
            document_path: Path of the text file to analyse.
            task: Task description passed to the processor.

        Returns:
            Whatever ``self.processor.process`` returns for the document.
        """
        return self.processor.process(task, self._read_document(document_path))

    def answer_question(self, document_path: str, question: str) -> str:
        """Answer ``question`` from selected chunks of the document.

        Args:
            document_path: Path of the text file to read.
            question: The question to answer.

        Returns:
            The LLM's answer generated from the selected chunks.
        """
        text = self._read_document(document_path)

        # Retrieval-style flow: chunk, select, then answer from the selection.
        chunks = FixedSizeChunker().chunk(text)
        relevant_chunks = self._find_relevant(chunks, question)

        context = "\n\n".join(relevant_chunks)
        prompt = f"基于以下内容回答问题: {context} 问题:{question} 回答:"
        return self.llm.generate(prompt)

    def _read_document(self, document_path: str) -> str:
        """Return the full text of the file at ``document_path``."""
        with open(document_path, "r", encoding=self.encoding) as f:
            return f.read()

    def _find_relevant(self, chunks: list, query: str) -> list:
        """Return the chunks to use as context for ``query``.

        Placeholder implementation: ``query`` is ignored and the first three
        chunks are returned.
        """
        return chunks[:3]


# Article section heading (originally fused to the end of this line): 总结
长文本处理技术:
- 分块策略:固定大小 vs 语义分块
- Map-Reduce:分而治之的经典方法
- 滑动窗口:处理连续文本
- 递归总结:层次化压缩
关键要点:
- 根据任务选择合适的分块策略
- 保持一定的重叠避免信息丢失
- 对于问答任务,优先检索相关部分
- 考虑使用向量数据库进行语义检索