EmailBill/Service/AI/TextSegmentService.cs
using JiebaNet.Analyser;
using JiebaNet.Segmenter;

namespace Service.AI;

/// <summary>
/// Text segmentation service interface
/// </summary>
public interface ITextSegmentService
{
    /// <summary>
    /// Extracts keywords from a piece of text.
    /// </summary>
    /// <param name="text">Text to analyze</param>
    /// <param name="topN">Number of keywords to return, 5 by default</param>
    /// <returns>List of keywords</returns>
    List<string> ExtractKeywords(string text, int topN = 5);

    /// <summary>
    /// Segments a piece of text into words.
    /// </summary>
    /// <param name="text">Text to segment</param>
    /// <returns>List of word segments</returns>
    List<string> Segment(string text);
}
/// <summary>
/// Text segmentation service backed by JiebaNet
/// </summary>
public class TextSegmentService : ITextSegmentService
{
    private readonly JiebaSegmenter _segmenter;
    private readonly TfidfExtractor _extractor;
    private readonly ILogger<TextSegmentService> _logger;

    public TextSegmentService(ILogger<TextSegmentService> logger)
    {
        _logger = logger;
        _segmenter = new JiebaSegmenter();
        _extractor = new TfidfExtractor();
        // Only add business-specific terms that may be missing from the built-in JiebaNet dictionary
        AddCustomWords();
    }

    /// <summary>
    /// Adds custom dictionary entries - only terms the built-in JiebaNet dictionary may be missing
    /// </summary>
    private void AddCustomWords()
    {
        try
        {
            // Only add business-specific terms that may be missing;
            // most common words (e.g. "美团", "支付宝") are already built into JiebaNet
            var customWords = new[]
            {
                // Compound words, kept intact as single tokens during segmentation
                // TODO: move to a config file and have AI periodically extract compound words into it
                "水电费", "物业费", "燃气费"
            };
            foreach (var word in customWords)
            {
                _segmenter.AddWord(word);
            }
            if (customWords.Length > 0)
            {
                _logger.LogDebug("Loaded {Count} custom words", customWords.Length);
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to add custom dictionary entries");
        }
    }
    public List<string> ExtractKeywords(string text, int topN = 5)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }
        try
        {
            // Extract keywords with the TF-IDF algorithm (stop-word filtering is built in)
            var keywords = _extractor.ExtractTags(text, topN, new List<string>());
            // Drop single characters, keeping only meaningful words
            var filteredKeywords = keywords
                .Where(k => k.Length >= 2)
                .Distinct()
                .ToList();
            // If nothing survives the filter, fall back to basic segmentation and take the longest words
            if (filteredKeywords.Count == 0)
            {
                var segments = Segment(text);
                filteredKeywords = segments
                    .Where(s => s.Length >= 2)
                    .OrderByDescending(s => s.Length)
                    .Take(topN)
                    .Distinct()
                    .ToList();
            }
            // Still nothing: fall back to the first 10 characters of the original text
            if (filteredKeywords.Count == 0 && text.Length > 0)
            {
                filteredKeywords.Add(text.Length > 10 ? text.Substring(0, 10) : text);
            }
            _logger.LogDebug("Extracted keywords from text '{Text}': {Keywords}",
                text, string.Join(", ", filteredKeywords));
            return filteredKeywords;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Keyword extraction failed, text: {Text}", text);
            // Degrade gracefully: return a truncated slice of the original text
            return [text.Length > 10 ? text.Substring(0, 10) : text];
        }
    }
    public List<string> Segment(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return [];
        }
        try
        {
            // Run the segmenter
            var segments = _segmenter.Cut(text).ToList();
            // Drop empty and whitespace-only segments, trimming the rest
            var filteredSegments = segments
                .Where(s => !string.IsNullOrWhiteSpace(s))
                .Select(s => s.Trim())
                .ToList();
            return filteredSegments;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Segmentation failed, text: {Text}", text);
            return [text];
        }
    }
}
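
A minimal usage sketch, for reference only and not part of TextSegmentService.cs: it assumes a hypothetical standalone consumer that constructs the service directly with NullLogger<T> from Microsoft.Extensions.Logging.Abstractions; in a real host it would instead be resolved through DI, e.g. services.AddSingleton<ITextSegmentService, TextSegmentService>(). The sample text and the commented results are illustrative; actual output depends on the JiebaNet dictionary and the custom words added above.

// Hypothetical standalone consumer (Program.cs), not part of the file above
using Microsoft.Extensions.Logging.Abstractions;
using Service.AI;

ITextSegmentService segmentService =
    new TextSegmentService(NullLogger<TextSegmentService>.Instance);

// Segment a bill description, then extract its top 3 keywords
var text = "本月水电费和物业费共计356元";
var segments = segmentService.Segment(text);                  // e.g. 本月 / 水电费 / 和 / 物业费 / 共计 / 356 / 元
var keywords = segmentService.ExtractKeywords(text, topN: 3); // e.g. 水电费, 物业费, 共计

Console.WriteLine(string.Join(" / ", segments));
Console.WriteLine(string.Join(", ", keywords));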