社区项目-敏感词过滤算法

功能

对输入的文字进行敏感词过滤。

算法原型

前缀树(字典树)
示例:待过滤字符串为 xwabfabcff,敏感词为 adc、bf、be。
1. 首先用敏感词建立一棵字典树。
2. 使用三个指针进行匹配:遍历指针、树指针、发现指针。
![](https://i.imgur.com/yfBr3kr.jpg)

以上是原型

整合到项目中的实现

@Service
public class SensitiveService implements InitializingBean{
    // Logger for this service, obtained from LoggerFactory for SensitiveService.
    public final static Logger logger = LoggerFactory.getLogger(SensitiveService.class);
    /**
     * Loads the sensitive-word list and feeds it into the trie.
     *
     * <p>Reads the classpath resource {@code SensitiveWords.txt} line by line and
     * passes every trimmed line to {@code addWord}. Any failure (missing resource,
     * I/O error) is logged together with its cause and is not rethrown, so the
     * service still comes up, only with an empty or partially filled trie.
     *
     * @throws Exception declared by the overridden method; this implementation
     *                   catches and logs every exception itself
     */
    @Override
    public void afterPropertiesSet() throws Exception {
        // try-with-resources guarantees the stream is closed even when reading fails;
        // a null resource is legal here and is simply not closed.
        try (InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream("SensitiveWords.txt")) {
            if (is == null) {
                // Fail with a descriptive message instead of an anonymous NullPointerException.
                throw new IllegalStateException("SensitiveWords.txt not found on classpath");
            }
            // Decode explicitly as UTF-8: relying on the platform default charset
            // garbles non-ASCII words on machines with a different default.
            BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8));
            String lineText;
            while ((lineText = bufferedReader.readLine()) != null) {
                // Each line of the file is one word to insert into the trie.
                addWord(lineText.trim());
            }
        } catch (Exception e) {
            // Keep the cause in the log so a broken word list can be diagnosed.
            logger.error("读取敏感词文件失败", e);
        }
    }
    // Root of the sensitive-word trie; addWord inserts below it and filter starts every match from it.
    private TrieNode root = new TrieNode();
    /**
     * Inserts one sensitive word into the trie.
     *
     * <p>Characters for which {@code isSymbol} returns true are skipped, so they do
     * not become part of the stored word. The node reached by the last stored
     * character is marked as the end of a keyword.
     *
     * @param lineText the word to add
     */
    private void addWord(String lineText){
        TrieNode temp = root;
        for (int i = 0; i < lineText.length(); ++i) {
            char c = lineText.charAt(i);
            if (isSymbol(c)) {
                continue;
            }

            // Reuse the existing child for this character, or create it.
            TrieNode node = temp.getSubNode(c);
            if (node == null) {
                node = new TrieNode();
                temp.addSubNode(c, node);
            }
            temp = node;
        }

        // Mark the end after the loop rather than at the last index: a word whose
        // final character is a skipped symbol would otherwise never be marked.
        // A word with no storable characters leaves temp at the root, which must
        // not become a keyword end.
        if (temp != root) {
            temp.setKeyWordEnd(true);
        }
    }

    /**
     *
     */
    private class TrieNode{
        //是不是关键词的结尾
        private boolean end = false;
        //当前节点下的所有子节点
        private Map<Character,TrieNode> subNodes= new HashMap<Character,TrieNode>();
        //
        public void addSubNode(Character key,TrieNode trieNode){
            subNodes.put(key,trieNode);
        }

        TrieNode getSubNode(Character key){
            return subNodes.get(key);
        }

        boolean isKeyWordEnd(){
            return end;
        }

        void setKeyWordEnd(Boolean end){
            this.end = end;
        }
    }



    /**
     * Decides whether a character counts as a symbol.
     *
     * @param c the character to classify
     * @return {@code false} for ASCII letters and digits and for characters in
     *         U+2E80..U+9FFF (treated here as East Asian text); {@code true} otherwise
     */
    public boolean isSymbol(char c){
        if (CharUtils.isAsciiAlphanumeric(c)) {
            return false;
        }
        // U+2E80..U+9FFF is the range this method treats as East Asian text.
        boolean eastAsian = c >= 0x2E80 && c <= 0x9FFF;
        return !eastAsian;
    }
    /**
     * Replaces every sensitive word found in the text with a fixed notice.
     *
     * <p>The text is scanned with two indices over the trie: {@code begin} marks
     * where the current candidate match starts and {@code position} is the
     * character being examined. Characters for which {@code isSymbol} returns true
     * are skipped while a candidate is being matched, so symbols placed between
     * the characters of a word do not hide it; outside a candidate they are copied
     * to the output unchanged. A candidate ends as soon as a node marked as
     * keyword end is reached.
     *
     * @param text the text to filter
     * @return the filtered text; {@code text} itself when it is blank
     */
    public String filter(String text){
        if (StringUtils.isBlank(text)){
            return text;
        }

        String replacement="这是敏感词,被屏蔽了,对不起了老哥";

        TrieNode tempNode = root;

        int begin = 0;
        int position = 0;
        StringBuilder result = new StringBuilder();

        // Loop on begin, not position: every start index must be tried. Whenever
        // tempNode is the root, begin equals position, so position can only pass
        // the end of the text while a candidate is still open.
        while (begin<text.length()){
            if (position>=text.length()){
                // The text ended in the middle of a candidate. Treat it as a
                // mismatch and rescan from begin+1; appending the tail blindly
                // would miss a keyword that starts inside it (words "abcd" and
                // "bc", text "abc").
                result.append(text.charAt(begin));
                position = begin+1;
                begin = position;
                tempNode = root;
                continue;
            }

            char c = text.charAt(position);

            if (isSymbol(c)){
                // Outside a candidate the symbol is ordinary output; inside one it
                // is skipped so the match can continue past it.
                if (tempNode ==root){
                    result.append(c);
                    ++begin;
                }
                ++position;
                continue;
            }
            tempNode = tempNode.getSubNode(c);

            if (tempNode==null){
                // No keyword starts at begin: emit that character and restart
                // the search one character later.
                result.append(text.charAt(begin));
                position = begin+1;
                begin = position;
                tempNode = root;
            }else if (tempNode.isKeyWordEnd()){
                // A keyword spans begin..position: emit the notice instead of it.
                result.append(replacement);
                position = position+1;
                begin = position;
                tempNode = root;
            }else {
                ++position;
            }
        }
        return result.toString();
    }
0%