2026-01-18 22:04:56 +08:00
|
|
|
|
using JiebaNet.Analyser;
|
2025-12-29 20:30:15 +08:00
|
|
|
|
using JiebaNet.Segmenter;
|
2026-01-18 22:04:56 +08:00
|
|
|
|
|
2026-01-28 11:19:23 +08:00
|
|
|
|
namespace Service.AI;
|
2025-12-29 20:30:15 +08:00
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Contract for a text segmentation service.
/// </summary>
public interface ITextSegmentService
{
    /// <summary>
    /// Extracts keywords from a piece of text.
    /// </summary>
    /// <param name="text">The text to analyse.</param>
    /// <param name="topN">How many top keywords to return; defaults to 5.</param>
    /// <returns>The list of keywords.</returns>
    List<string> ExtractKeywords(string text, int topN = 5);

    /// <summary>
    /// Splits a piece of text into word segments.
    /// </summary>
    /// <param name="text">The text to segment.</param>
    /// <returns>The list of segments.</returns>
    List<string> Segment(string text);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// JiebaNet-based implementation of <see cref="ITextSegmentService"/>.
/// </summary>
public class TextSegmentService : ITextSegmentService
{
    /// <summary>
    /// Maximum number of UTF-16 code units of the raw text that is returned
    /// as a last-resort keyword when nothing better could be extracted.
    /// </summary>
    private const int FallbackKeywordLength = 10;

    private readonly JiebaSegmenter _segmenter;
    private readonly TfidfExtractor _extractor;
    private readonly ILogger<TextSegmentService> _logger;

    /// <summary>
    /// Creates the segmenter and the TF-IDF extractor, then registers the custom words.
    /// </summary>
    /// <param name="logger">Logger used for diagnostics and failure reporting.</param>
    public TextSegmentService(ILogger<TextSegmentService> logger)
    {
        _logger = logger;
        _segmenter = new JiebaSegmenter();

        // NOTE(review): the extractor is created without being handed _segmenter;
        // whether it sees the custom words added below depends on JiebaNet internals — confirm.
        _extractor = new TfidfExtractor();

        // Only add business-specific terms that the JiebaNet dictionary may be missing.
        AddCustomWords();
    }

    /// <summary>
    /// Registers custom dictionary entries with the segmenter. Failures are
    /// logged as warnings and otherwise ignored, so construction never fails
    /// because of the custom dictionary.
    /// </summary>
    private void AddCustomWords()
    {
        try
        {
            // Only business terms that may be missing are listed here; most common
            // words (e.g. "美团", "支付宝") are expected to be built into JiebaNet already.
            // Compound terms, registered so they are recognised as a single token.
            // TODO: move this list to a configuration file and let AI periodically
            // extract compound terms to fill it.
            var customWords = new[]
            {
                "水电费", "物业费", "燃气费"
            };

            foreach (var word in customWords)
            {
                _segmenter.AddWord(word);
            }

            if (customWords.Length > 0)
            {
                _logger.LogDebug("已加载 {Count} 个自定义词汇", customWords.Length);
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "添加自定义词典失败");
        }
    }

    /// <summary>
    /// Extracts keywords from <paramref name="text"/>.
    /// </summary>
    /// <remarks>
    /// Strategy, in order:
    /// <list type="number">
    /// <item>TF-IDF tags from the extractor, keeping only distinct tags of at least two characters.</item>
    /// <item>If that yields nothing: the longest distinct segments (at least two characters) from <see cref="Segment"/>, at most <paramref name="topN"/>.</item>
    /// <item>If that still yields nothing: the leading part of the raw text.</item>
    /// </list>
    /// Any exception is logged and the leading part of the raw text is returned instead.
    /// </remarks>
    /// <param name="text">The text to analyse.</param>
    /// <param name="topN">How many top keywords to request; defaults to 5.</param>
    /// <returns>
    /// The keyword list; empty when <paramref name="text"/> is null, empty or whitespace.
    /// </returns>
    public List<string> ExtractKeywords(string text, int topN = 5)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }

        try
        {
            // TF-IDF extraction; single-character tags are dropped as not meaningful.
            var keywords = _extractor.ExtractTags(text, topN, new List<string>())
                .Where(k => k.Length >= 2)
                .Distinct()
                .ToList();

            if (keywords.Count == 0)
            {
                // De-duplicate BEFORE taking topN so repeated segments do not
                // consume the budget and shrink the result below topN.
                keywords = Segment(text)
                    .Where(s => s.Length >= 2)
                    .Distinct()
                    .OrderByDescending(s => s.Length)
                    .Take(topN)
                    .ToList();
            }

            if (keywords.Count == 0)
            {
                keywords.Add(TruncateForFallback(text));
            }

            _logger.LogDebug("从文本 '{Text}' 中提取关键词: {Keywords}",
                text, string.Join(", ", keywords));

            return keywords;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "提取关键词失败,文本: {Text}", text);

            // Degraded mode: fall back to the leading part of the raw text.
            return [TruncateForFallback(text)];
        }
    }

    /// <summary>
    /// Splits <paramref name="text"/> into segments using the JiebaNet segmenter.
    /// Whitespace-only tokens are dropped and the remaining tokens are trimmed;
    /// no stop-word filtering is applied here.
    /// </summary>
    /// <param name="text">The text to segment.</param>
    /// <returns>
    /// The segment list; empty when <paramref name="text"/> is null, empty or whitespace.
    /// If segmentation throws, the error is logged and a single-element list
    /// containing the original text is returned.
    /// </returns>
    public List<string> Segment(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }

        try
        {
            return _segmenter.Cut(text)
                .Where(s => !string.IsNullOrWhiteSpace(s))
                .Select(s => s.Trim())
                .ToList();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "分词失败,文本: {Text}", text);
            return [text];
        }
    }

    /// <summary>
    /// Returns at most <see cref="FallbackKeywordLength"/> leading UTF-16 code units
    /// of <paramref name="text"/>, backing off by one when the cut would otherwise
    /// separate a surrogate pair and leave a malformed string.
    /// </summary>
    private static string TruncateForFallback(string text)
    {
        if (text.Length <= FallbackKeywordLength)
        {
            return text;
        }

        var end = FallbackKeywordLength;
        if (char.IsHighSurrogate(text[end - 1]))
        {
            end--;
        }

        return text[..end];
    }
}
|