Files
EmailBill/Service/AI/TextSegmentService.cs

152 lines
4.5 KiB
C#
Raw Normal View History

2026-01-18 22:04:56 +08:00
using JiebaNet.Analyser;
2025-12-29 20:30:15 +08:00
using JiebaNet.Segmenter;
2026-01-18 22:04:56 +08:00
namespace Service.AI;
2025-12-29 20:30:15 +08:00
/// <summary>
/// Contract for a text segmentation service: word segmentation and keyword extraction.
/// </summary>
public interface ITextSegmentService
{
/// <summary>
/// Extracts keywords from the given text.
/// </summary>
/// <param name="text">The text to analyse.</param>
/// <param name="topN">Number of top-ranked keywords to return; defaults to 5.</param>
/// <returns>The list of extracted keywords.</returns>
List<string> ExtractKeywords(string text, int topN = 5);
/// <summary>
/// Splits the given text into word segments.
/// </summary>
/// <param name="text">The text to segment.</param>
/// <returns>The list of segments.</returns>
List<string> Segment(string text);
}
/// <summary>
/// <see cref="ITextSegmentService"/> implementation backed by JiebaNet.
/// </summary>
public class TextSegmentService : ITextSegmentService
{
    /// <summary>
    /// Minimum length (in UTF-16 code units) a token must have to be kept as a keyword;
    /// drops single-character tokens.
    /// </summary>
    private const int MinKeywordLength = 2;

    /// <summary>
    /// Maximum length (in UTF-16 code units) of the raw-text prefix returned when no keyword
    /// could be extracted.
    /// </summary>
    private const int FallbackTextLength = 10;

    /// <summary>
    /// Compound business terms registered with the segmenter so that they are recognised as
    /// single tokens. Only terms that may be missing from the built-in dictionary belong here.
    /// </summary>
    // TODO: move to a configuration file and let the AI periodically extract compound terms into it.
    private static readonly string[] CustomWords = ["水电费", "物业费", "燃气费"];

    private readonly JiebaSegmenter _segmenter;
    private readonly TfidfExtractor _extractor;
    private readonly ILogger<TextSegmentService> _logger;

    /// <summary>
    /// Creates the service, instantiating the segmenter and the TF-IDF extractor and
    /// registering the custom business terms.
    /// </summary>
    /// <param name="logger">Logger used for diagnostics and failure reporting.</param>
    public TextSegmentService(ILogger<TextSegmentService> logger)
    {
        _logger = logger;
        _segmenter = new JiebaSegmenter();
        _extractor = new TfidfExtractor();

        AddCustomWords();
    }

    /// <summary>
    /// Registers <see cref="CustomWords"/> with the segmenter. Failures are logged as warnings
    /// and otherwise ignored so that the service can still be constructed.
    /// </summary>
    private void AddCustomWords()
    {
        try
        {
            foreach (var word in CustomWords)
            {
                _segmenter.AddWord(word);
            }

            if (CustomWords.Length > 0)
            {
                _logger.LogDebug("已加载 {Count} 个自定义词汇", CustomWords.Length);
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "添加自定义词典失败");
        }
    }

    /// <summary>
    /// Extracts up to <paramref name="topN"/> keywords from <paramref name="text"/>.
    /// </summary>
    /// <remarks>
    /// Strategy, in order:
    /// <list type="number">
    /// <item>TF-IDF extraction, keeping tokens of at least <see cref="MinKeywordLength"/> characters.</item>
    /// <item>If that yields nothing: plain segmentation, longest distinct tokens first.</item>
    /// <item>If that still yields nothing: a prefix of the original text.</item>
    /// </list>
    /// If extraction throws, the error is logged and a prefix of the original text is returned.
    /// </remarks>
    /// <param name="text">The text to analyse.</param>
    /// <param name="topN">Maximum number of keywords to return; defaults to 5.</param>
    /// <returns>
    /// The extracted keywords; an empty list when <paramref name="text"/> is null/blank or
    /// <paramref name="topN"/> is not positive.
    /// </returns>
    public List<string> ExtractKeywords(string text, int topN = 5)
    {
        // A non-positive topN asks for no keywords; answer that directly instead of letting the
        // extractor and the Take(topN) fallback interpret the value differently.
        if (string.IsNullOrWhiteSpace(text) || topN <= 0)
        {
            return [];
        }

        try
        {
            var keywords = _extractor.ExtractTags(text, topN, new List<string>())
                .Where(k => k.Length >= MinKeywordLength)
                .Distinct()
                .ToList();

            if (keywords.Count == 0)
            {
                // De-duplicate BEFORE taking topN so that repeated tokens do not consume slots.
                keywords = Segment(text)
                    .Where(s => s.Length >= MinKeywordLength)
                    .Distinct()
                    .OrderByDescending(s => s.Length)
                    .Take(topN)
                    .ToList();
            }

            if (keywords.Count == 0)
            {
                keywords.Add(TruncateForFallback(text));
            }

            _logger.LogDebug("从文本 '{Text}' 中提取关键词: {Keywords}",
                text, string.Join(", ", keywords));

            return keywords;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "提取关键词失败,文本: {Text}", text);

            // Degrade gracefully: fall back to a prefix of the original text.
            return [TruncateForFallback(text)];
        }
    }

    /// <summary>
    /// Splits <paramref name="text"/> into segments, dropping blank segments and trimming the rest.
    /// </summary>
    /// <param name="text">The text to segment.</param>
    /// <returns>
    /// The trimmed, non-blank segments; an empty list when <paramref name="text"/> is null/blank.
    /// If segmentation throws, the error is logged and a single-element list containing the
    /// original text is returned.
    /// </returns>
    public List<string> Segment(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }

        try
        {
            // Only blank segments are removed here; no stop-word filtering is applied.
            return _segmenter.Cut(text)
                .Where(s => !string.IsNullOrWhiteSpace(s))
                .Select(s => s.Trim())
                .ToList();
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "分词失败,文本: {Text}", text);
            return [text];
        }
    }

    /// <summary>
    /// Returns at most <see cref="FallbackTextLength"/> leading UTF-16 code units of
    /// <paramref name="text"/>, never cutting between the two halves of a surrogate pair.
    /// </summary>
    /// <param name="text">The non-null text to shorten.</param>
    /// <returns>The text itself when short enough, otherwise its prefix.</returns>
    private static string TruncateForFallback(string text)
    {
        if (text.Length <= FallbackTextLength)
        {
            return text;
        }

        // If the last kept unit is a high surrogate, its low half would be cut off; drop it too.
        var length = char.IsHighSurrogate(text[FallbackTextLength - 1])
            ? FallbackTextLength - 1
            : FallbackTextLength;

        return text[..length];
    }
}