// ==================== File: ResourcesInitializerUtil.java ====================
package com.huawei.ci.portal.provider.utils;

import com.huawei.ci.portal.provider.enums.assistanthelper.AssistantConstants;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.io.*;
import java.util.*;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.stream.Collectors;

/**
 * Loads the sensitive-word lexicon from a classpath resource and arranges it
 * into the nested-map (DFA / trie) structure consumed by
 * {@code SensitiveWordsFilterCheckerUtil}.
 *
 * <p>Tree shape: each node is a {@code Map} whose keys are {@code Character}
 * children plus the {@code String} key {@code "isEnd"} ("1" when a complete
 * word terminates at that node, "0" otherwise).
 *
 * @Description: 资源文件初始化处理类 (resource-file initializer)
 * @ClassName: com.huawei.ci.portal.provider.utils
 * @Author: hwx1123794/hexiangyun
 * @DateTime: 2022/7/21 10:04
 **/
@Data
@AllArgsConstructor
@Component
public class ResourcesInitializerUtil {

    private static final Logger logger = LoggerFactory.getLogger(ResourcesInitializerUtil.class);

    // Pre-built run of replacement characters, replaceSize repetitions of replaceStr.
    private static StringBuilder replaceAll;

    // Charset name used to decode the lexicon file (configured centrally).
    private static String encoding = AssistantConstants.ENCODING;

    // Character used to mask sensitive words.
    private static String replaceStr = AssistantConstants.BE_REPLACE_STR;

    private static int replaceSize = AssistantConstants.BE_REPLACE_SIZE;

    // Classpath location of the lexicon file (including suffix).
    private static String fileName = AssistantConstants.FILE_PATH;

    // Raw word list as read from the lexicon file, one word per line.
    private static Set<String> allSensitiveWordSet = new HashSet<>();

    // DFA tree built from allSensitiveWordSet.
    private static Map<Object, Object> afterDealingSensitiveWords = new HashMap<>();

    /**
     * Overrides the lexicon resource name. NOTE(review): the field is static,
     * so this affects every instance, not just the one being constructed.
     *
     * @param fileName lexicon file name on the classpath (including suffix)
     */
    public ResourcesInitializerUtil(String fileName) {
        ResourcesInitializerUtil.fileName = fileName;
    }

    /**
     * Initializes the sensitive-word lexicon: reads every line of the
     * resource file and builds the DFA lookup tree.
     *
     * <p>NOTE(review): the original implementation created a
     * {@link ReentrantReadWriteLock} as a local variable, which synchronizes
     * nothing because every call held its own lock; it has been removed. If
     * concurrent initialization must be guarded, use a shared static lock.
     *
     * @return root of the word tree, keyed by each word's first character;
     *         returns the previous (possibly empty) tree if reading fails
     */
    protected Map<Object, Object> InitializationWorkUtil() {
        replaceAll = new StringBuilder(replaceSize);
        for (int x = 0; x < replaceSize; x++) {
            replaceAll.append(replaceStr);
        }
        logger.info("Init sensitive-words configuration start ");
        InputStream resource = Objects.requireNonNull(
                ResourcesInitializerUtil.class.getClassLoader().getResourceAsStream(fileName));
        // try-with-resources replaces the original manual finally-close chain;
        // closing the BufferedReader also closes the wrapped reader.
        try (InputStreamReader read = new InputStreamReader(resource, encoding);
             // Buffer size hand-tuned to 80K, as in the original code.
             BufferedReader bufferedReader = new BufferedReader(read, 81920)) {
            allSensitiveWordSet = bufferedReader.lines().collect(Collectors.toSet());
            // Arrange the flat word set into the DFA tree.
            afterDealingSensitiveWords = addSensitiveWords2HashConstruct(allSensitiveWordSet);
            logger.info("Init sensitive-words configuration success ");
        } catch (IOException e) {
            // Preserve the cause in the log instead of swallowing it.
            logger.error("read file failed", e);
        }
        return afterDealingSensitiveWords;
    }

    /**
     * Builds the DFA word tree: words sharing a prefix share nested maps
     * (all words starting with the same character live under one entry).
     *
     * @param sensitiveWords words read from the lexicon file
     * @return root map, keyed by each word's first character
     */
    @SuppressWarnings("unchecked")
    private HashMap<Object, Object> addSensitiveWords2HashConstruct(Set<String> sensitiveWords) {
        HashMap<Object, Object> sensitiveWordsMap = new HashMap<>(sensitiveWords.size());
        for (String currentWord : sensitiveWords) {
            Map<Object, Object> childMap = sensitiveWordsMap;
            for (int i = 0; i < currentWord.length(); i++) {
                char key = currentWord.charAt(i);
                Object wordMap = childMap.get(key);
                if (wordMap != null) {
                    // Prefix already present: descend the existing branch.
                    childMap = (Map<Object, Object>) wordMap;
                } else {
                    Map<Object, Object> newWordMap = new HashMap<>();
                    // New node starts as a non-terminal.
                    newWordMap.put("isEnd", "0");
                    childMap.put(key, newWordMap);
                    childMap = newWordMap;
                }
                if (i == currentWord.length() - 1) {
                    // Last character of the word: mark the node terminal.
                    childMap.put("isEnd", "1");
                }
            }
        }
        return sensitiveWordsMap;
    }
}

// ==================== File: SensitiveWordsFilterCheckerUtil.java ====================
package com.huawei.ci.portal.provider.utils;

import com.huawei.ci.portal.provider.enums.assistanthelper.AssistantConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.util.*;

/**
 * Filters and masks sensitive (vulgar) words in free text using the DFA tree
 * built by {@link ResourcesInitializerUtil}.
 *
 * @Description: 敏感词过滤工具,过滤不雅词汇 (sensitive-word filter)
 * @ClassName: com.huawei.ci.portal.provider.utils
 * @Author: hwx1123794/hexiangyun
 * @DateTime: 2022/7/21 15:54
 **/
@Component
public class SensitiveWordsFilterCheckerUtil {

    private static final Logger logger = LoggerFactory.getLogger(SensitiveWordsFilterCheckerUtil.class);

    // DFA word tree; loaded once at class-initialization time. NOTE(review):
    // if the lexicon resource is missing, the static initializer fails and
    // the class cannot load — confirm that is the intended behavior.
    private static final Map<Object, Object> sensitiveWordsMap;

    // Match type 1: stop at the first (shortest) complete word.
    private static final Integer minMatchType = 1;

    // Match type 2: keep scanning, prefer the longest complete word.
    private static final Integer maxMatchType = 2;

    static {
        sensitiveWordsMap = new ResourcesInitializerUtil().InitializationWorkUtil();
    }

    /**
     * Checks whether a sensitive word starts at {@code start} in {@code txt}.
     *
     * <p>Fix: the match length is now recorded only at nodes where
     * {@code isEnd == "1"}. Previously the counter kept growing along a
     * prefix that never completed a word, so with words {"ab","abcd"} and
     * text "abc" the max-match mode returned 3 and flagged the non-word
     * "abc".
     *
     * @param txt       text to scan
     * @param start     index at which a match is attempted
     * @param matchType 1 = shortest match; 2 = longest match
     * @return length of the matched sensitive word (&gt; 0), or 0 when no
     *         word of length &ge; 2 starts at {@code start}
     */
    @SuppressWarnings("unchecked")
    private static Integer checkIfExistSensitiveWords(String txt, Integer start, Integer matchType) {
        boolean flag = false;   // true once a complete word has been seen
        int matchFlag = 0;      // characters walked along the tree so far
        int matchLength = 0;    // length at the last complete word
        Map<Object, Object> childMap = sensitiveWordsMap;
        for (int i = start; i < txt.length(); i++) {
            char word = txt.charAt(i);
            childMap = (Map<Object, Object>) childMap.get(word);
            if (childMap == null) {
                // No branch for this character: the walk ends here.
                break;
            }
            matchFlag++;
            if ("1".equals(childMap.get("isEnd"))) {
                flag = true;
                matchLength = matchFlag;  // remember the last full word
                if (minMatchType.equals(matchType)) {
                    // Shortest-match mode: stop at the first complete word.
                    break;
                }
            }
        }
        if (matchLength < 2 || !flag) {
            // Only words of length >= 2 count, and the match must end on a
            // word boundary (isEnd == "1").
            matchLength = 0;
        }
        return matchLength;
    }

    /**
     * Collects every sensitive word occurring in {@code txt}.
     *
     * @param txt       text to scan
     * @param matchType 1 = shortest match; 2 = longest match
     * @return distinct sensitive words found (may be empty, never null)
     */
    public static Set<String> getSensitiveWords(String txt, Integer matchType) {
        Set<String> sensitiveWords = new HashSet<>();
        for (int i = 0; i < txt.length(); i++) {
            // Length of the sensitive word starting at position i, if any.
            Integer length = checkIfExistSensitiveWords(txt, i, matchType);
            if (length > 0) {
                sensitiveWords.add(txt.substring(i, i + length));
                // Skip past the matched word; the loop's i++ adds the final step.
                i = i + length - 1;
            }
        }
        return sensitiveWords;
    }

    /**
     * Masks every sensitive word in the input text.
     *
     * @param txtOfInput text to clean
     * @param matchType  1 = shortest match; 2 = longest match
     * @param replaceStr replacement character; {@code null} falls back to the
     *                   configured default
     * @return text with each sensitive word replaced character-for-character
     */
    public String replaceSensitiveWords(String txtOfInput, Integer matchType, String replaceStr) {
        if (txtOfInput == null || "".equals(txtOfInput)) {
            return txtOfInput;
        }
        Set<String> sensitiveWords = getSensitiveWords(txtOfInput, matchType);
        for (String sWord : sensitiveWords) {
            String replaceString = getReplaceString(replaceStr, sWord.length());
            // Fix: String.replace (literal) instead of replaceAll (regex) —
            // a sensitive word containing regex metacharacters such as '*'
            // or '(' must not be interpreted as a pattern.
            txtOfInput = txtOfInput.replace(sWord, replaceString);
        }
        return txtOfInput;
    }

    /**
     * Builds the masking string: {@code replaceStr} repeated {@code length}
     * times.
     *
     * @param replaceStr replacement symbol; {@code null} uses the default
     *                   configured in {@code AssistantConstants}
     * @param length     number of repetitions (length of the masked word)
     */
    private static String getReplaceString(String replaceStr, Integer length) {
        if (replaceStr == null) {
            // Default mask symbol, configured in AssistantConstants.
            replaceStr = AssistantConstants.BE_REPLACE_STR;
        }
        // No shared mutation here, so an unsynchronized StringBuilder suffices.
        StringBuilder replaceString = new StringBuilder();
        for (int i = 0; i < length; i++) {
            replaceString.append(replaceStr);
        }
        return replaceString.toString();
    }

}