给定一些短词字符串作为分割词,去分割一段长字符串。从前往后遍历分割词,查找并分割长字符串为对应的token。分词规则如下:
1.优先匹配最长分割词:若多个分割词可匹配同一位置,选择长度最长的;长度相同时,按字典序较大的优先。
3.未匹配部分按空格切分:无法匹配分割词的部分,以空格为分隔拆成若干片段,每个连续的非空格片段作为独立token输出,空格本身不输出。
3.输出格式:每个token用括号包裹,按原字符串顺序输出。
输入描述:
短词字符串列表,每行一个,空行后输入待分割的长字符串。
输出描述:
括号包裹的分词结果,如(token1)(token2)
示例1:
输入:
zhong guo
zhong
guo
wo
mei guo
wo ai zhong guo mei guo ye xing
输出:
(wo)(ai)(zhong guo)(mei guo)(ye)(xing)
问题分析
给定一组分割词和一个长字符串,需要按照特定规则将长字符串分割为多个token。规则包括优先匹配最长分割词、字典序排序以及未匹配部分保留原样。输出格式要求用括号包裹每个token。
解决思路
- 分割词预处理:将分割词按长度从长到短排序,长度相同的按字典序降序排列。
- 遍历分割词:从长到短依次检查长字符串中是否有匹配的分割词。
- 分割字符串:匹配到的部分作为token;未匹配的部分按空格切分,每个连续的非空格片段作为独立token,空格本身不进入输出。
- 输出结果:将所有token按顺序用括号包裹输出。
代码实现
以下是Python、C++、Java、JavaScript和C语言的实现代码。
Python实现
def tokenize(segments, text):
    """Split ``text`` into tokens using the segment words.

    At each position the longest matching segment wins; among
    equal-length candidates the lexicographically LARGER one wins.
    Unmatched stretches are split on spaces into standalone tokens
    (the spaces themselves are dropped).

    Args:
        segments: list of segment words; sorted in place.
        text: the long string to split.

    Returns:
        list of tokens in original order.
    """
    # Sort by (length, word) descending: longest first, and for equal
    # lengths the lexicographically larger word first.
    # BUG FIX: the previous key (-len(x), x) with reverse=False broke
    # length ties in ASCENDING lexicographic order, violating the
    # "larger wins" rule.
    segments.sort(key=lambda x: (len(x), x), reverse=True)
    tokens = []
    i = 0
    n = len(text)
    while i < n:
        matched = False
        for seg in segments:
            seg_len = len(seg)
            if i + seg_len <= n and text[i:i + seg_len] == seg:
                tokens.append(seg)
                i += seg_len
                matched = True
                break
        if not matched:
            if text[i] == ' ':
                # Space between tokens: a separator, never output.
                i += 1
            else:
                # Consume a run of non-space characters as one token.
                j = i
                while j < n and text[j] != ' ':
                    j += 1
                tokens.append(text[i:j])
                i = j
    return tokens


# Read input: segment words, one per line, until a blank line,
# then the long string to split.
segments = []
while True:
    line = input().strip()
    if line == '':
        break
    segments.append(line)
text = input().strip()

# Tokenize and print every token wrapped in parentheses.
tokens = tokenize(segments, text)
print(''.join(f'({token})' for token in tokens))
C++实现
#include <iostream> #include <vector> #include <algorithm> #include <string> using namespace std; vector<string> tokenize(vector<string>& segments, string text) { // 预处理分割词:按长度降序,长度相同按字典序降序 sort(segments.begin(), segments.end(), [](const string& a, const string& b) { if (a.length() == b.length()) { return a > b; } return a.length() > b.length(); }); vector<string> tokens; int i = 0; int n = text.length(); while (i < n) { bool matched = false; for (const string& seg : segments) { int seg_len = seg.length(); if (i + seg_len <= n && text.substr(i, seg_len) == seg) { tokens.push_back(seg); i += seg_len; matched = true; break; } } if (!matched) { if (text[i] == ' ') { i++; } else { int j = i; while (j < n && text[j] != ' ') { j++; } tokens.push_back(text.substr(i, j - i)); i = j; } } } return tokens; } int main() { vector<string> segments; string line; while (getline(cin, line)) { if (line.empty()) { break; } segments.push_back(line); } string text; getline(cin, text); vector<string> tokens = tokenize(segments, text); for (const string& token : tokens) { cout << "(" << token << ")"; } cout << endl; return 0; }Java实现
import java.util.*;

public class Main {
    /**
     * Splits {@code text} into tokens using the segment words.
     * At each position the longest matching segment wins; among
     * equal-length candidates the lexicographically larger one wins.
     * Unmatched stretches are split on spaces into standalone tokens
     * (the spaces themselves are dropped).
     *
     * @param segments segment words; sorted in place
     * @param text     the long string to split
     * @return tokens in original order
     */
    public static List<String> tokenize(List<String> segments, String text) {
        // Longest first; equal lengths -> lexicographically larger first.
        segments.sort((a, b) -> {
            if (a.length() == b.length()) {
                return b.compareTo(a);
            }
            return Integer.compare(b.length(), a.length());
        });
        List<String> tokens = new ArrayList<>();
        int i = 0;
        int n = text.length();
        while (i < n) {
            boolean matched = false;
            for (String seg : segments) {
                // startsWith(seg, i) avoids the substring allocation of
                // text.substring(i, i + segLen).equals(seg) and bounds-checks
                // internally.
                if (text.startsWith(seg, i)) {
                    tokens.add(seg);
                    i += seg.length();
                    matched = true;
                    break;
                }
            }
            if (!matched) {
                if (text.charAt(i) == ' ') {
                    i++; // separator between tokens: skip
                } else {
                    // Consume a run of non-space characters as one token.
                    int j = i;
                    while (j < n && text.charAt(j) != ' ') {
                        j++;
                    }
                    tokens.add(text.substring(i, j));
                    i = j;
                }
            }
        }
        return tokens;
    }

    public static void main(String[] args) {
        Scanner scanner = new Scanner(System.in);
        List<String> segments = new ArrayList<>();
        // Guard with hasNextLine(): the original called nextLine()
        // unconditionally and threw NoSuchElementException when input
        // ended without a blank line / second section.
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            if (line.isEmpty()) {
                break;
            }
            segments.add(line);
        }
        String text = scanner.hasNextLine() ? scanner.nextLine() : "";
        // Batch output in a StringBuilder instead of one print per token.
        StringBuilder sb = new StringBuilder();
        for (String token : tokenize(segments, text)) {
            sb.append('(').append(token).append(')');
        }
        System.out.println(sb);
    }
}
JavaScript实现
/**
 * Split `text` into tokens using the segment words.
 * At each position the longest matching segment wins; among
 * equal-length candidates the lexicographically larger one wins
 * (localeCompare, descending). Unmatched stretches are split on
 * spaces into standalone tokens; the spaces are dropped.
 *
 * @param {string[]} segments segment words; sorted in place
 * @param {string} text the long string to split
 * @returns {string[]} tokens in original order
 */
function tokenize(segments, text) {
    // Longest word first; equal lengths -> lexicographically larger first.
    segments.sort((a, b) =>
        a.length === b.length ? b.localeCompare(a) : b.length - a.length
    );

    const result = [];
    const len = text.length;
    let pos = 0;

    while (pos < len) {
        // First entry in priority order that matches at `pos`.
        const hit = segments.find((word) => text.startsWith(word, pos));
        if (hit !== undefined) {
            result.push(hit);
            pos += hit.length;
            continue;
        }
        if (text[pos] === ' ') {
            pos += 1; // separator between tokens: skip
        } else {
            // Consume a run of non-space characters as one token.
            let end = pos;
            while (end < len && text[end] !== ' ') {
                end += 1;
            }
            result.push(text.slice(pos, end));
            pos = end;
        }
    }
    return result;
}

// ---- stdin handling: segment words until a blank line, then the text ----
const readline = require('readline');
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });

const segments = [];
let text = '';
let readingText = false;

rl.on('line', (line) => {
    if (line === '') {
        readingText = true;
    } else if (!readingText) {
        segments.push(line);
    } else {
        text = line;
        rl.close();
    }
}).on('close', () => {
    console.log(tokenize(segments, text).map((t) => `(${t})`).join(''));
});
C语言实现
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Dynamic list of segment words. */
typedef struct {
    char **segments;
    int count;
} SegmentList;

/* Duplicate exactly `len` bytes of `s` as a NUL-terminated string.
 * Portable replacement for strndup()/strdup(), which are POSIX,
 * not ISO C. Returns NULL on allocation failure. */
static char *dup_range(const char *s, size_t len) {
    char *out = malloc(len + 1);
    if (out != NULL) {
        memcpy(out, s, len);
        out[len] = '\0';
    }
    return out;
}

/* qsort comparator: longer strings first; for equal lengths the
 * lexicographically larger string first. */
int compare_segments(const void *a, const void *b) {
    const char *seg_a = *(const char **)a;
    const char *seg_b = *(const char **)b;
    int len_a = (int)strlen(seg_a);
    int len_b = (int)strlen(seg_b);
    if (len_a == len_b) {
        return strcmp(seg_b, seg_a);
    }
    return len_b - len_a;
}

/* Split `text` against the (already priority-sorted) segment list.
 * Matched segments become tokens; unmatched stretches are split on
 * spaces into standalone tokens (spaces themselves are dropped).
 * `*tokens` / `*token_count` receive a malloc'd token array the
 * caller must free. */
void tokenize(SegmentList *segments, const char *text,
              char ***tokens, int *token_count) {
    *tokens = NULL;
    *token_count = 0;
    int i = 0;
    int n = (int)strlen(text);
    while (i < n) {
        int matched = 0;
        for (int j = 0; j < segments->count; j++) {
            const char *seg = segments->segments[j];
            int seg_len = (int)strlen(seg);
            if (i + seg_len <= n && strncmp(text + i, seg, seg_len) == 0) {
                (*token_count)++;
                *tokens = realloc(*tokens, (size_t)*token_count * sizeof(char *));
                (*tokens)[*token_count - 1] = dup_range(text + i, (size_t)seg_len);
                i += seg_len;
                matched = 1;
                break;
            }
        }
        if (!matched) {
            if (text[i] == ' ') {
                i++; /* separator between tokens: skip */
            } else {
                /* Consume a run of non-space characters as one token. */
                int j = i;
                while (j < n && text[j] != ' ') {
                    j++;
                }
                (*token_count)++;
                *tokens = realloc(*tokens, (size_t)*token_count * sizeof(char *));
                (*tokens)[*token_count - 1] = dup_range(text + i, (size_t)(j - i));
                i = j;
            }
        }
    }
}

int main() {
    SegmentList segments;
    segments.segments = NULL;
    segments.count = 0;

    char line[1024];
    /* Segment words, one per line, terminated by a blank line. */
    while (fgets(line, sizeof(line), stdin)) {
        line[strcspn(line, "\n")] = '\0';
        if (line[0] == '\0') {
            break;
        }
        segments.count++;
        segments.segments = realloc(segments.segments,
                                    (size_t)segments.count * sizeof(char *));
        segments.segments[segments.count - 1] = dup_range(line, strlen(line));
    }

    /* BUG FIX: the original ignored this fgets() return value and would
     * have reused stale buffer contents when input ended early. */
    if (fgets(line, sizeof(line), stdin) == NULL) {
        line[0] = '\0';
    }
    line[strcspn(line, "\n")] = '\0';
    const char *text = line;

    /* Priority order: longest first, lexicographically larger first. */
    qsort(segments.segments, segments.count, sizeof(char *), compare_segments);

    char **tokens = NULL;
    int token_count = 0;
    tokenize(&segments, text, &tokens, &token_count);

    for (int i = 0; i < token_count; i++) {
        printf("(%s)", tokens[i]);
        free(tokens[i]);
    }
    printf("\n");
    free(tokens);

    for (int i = 0; i < segments.count; i++) {
        free(segments.segments[i]);
    }
    free(segments.segments);
    return 0;
}
代码说明
- 预处理分割词:将分割词按长度和字典序排序,确保优先匹配最长且字典序较大的分割词。
- 分割字符串:遍历长字符串,依次检查是否有匹配的分割词,匹配到的部分作为token,未匹配的部分按空格分割。
- 输出结果:将所有token用括号包裹并按顺序输出。
以上代码实现了题目要求的功能,适用于多种编程语言。