1、首先在pom中引入ansj_seg和nlp-lang的依赖包。
ansj_seg包的作用:
这是一个基于n-Gram+CRF+HMM的中文分词的java实现;
分词速度达到每秒钟大约200万字左右(mac air下测试),准确率能达到96%以上;
目前实现了中文分词、中文姓名识别、用户自定义词典、关键字提取、自动摘要、关键字标记等功能;
可以应用到自然语言处理等方面,适用于对分词效果要求高的各种项目。
nlp-lang包的作用(nlp常用工具和组件):
工具:词语标准化、trie树结构、双数组trie树、文本断句、html标签清理、Viterbi算法增加;
组件:汉字转拼音、简繁体转换、bloomfilter、指纹去重、SimHash文章相似度计算、词贡献统计、基于内存的搜索提示、WordWeight词频统计、词idf统计、词类别相关度统计。
Maven
<!-- nlp-lang -->
<dependency>
    <groupId>org.nlpcn</groupId>
    <artifactId>nlp-lang</artifactId>
    <version>1.7.2</version>
</dependency>
<!-- ansj_seg -->
<dependency>
    <groupId>org.ansj</groupId>
    <artifactId>ansj_seg</artifactId>
    <version>5.1.2</version>
</dependency>
2、创建WordUtil类如下
package com.mengyao.nlp.util;import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;import org.ansj.app.keyword.KeyWordComputer;
import org.ansj.app.keyword.Keyword;
import org.ansj.app.summary.SummaryComputer;
import org.ansj.app.summary.pojo.Summary;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.commons.lang3.StringUtils;
import org.nlpcn.commons.lang.jianfan.JianFan;
import org.nlpcn.commons.lang.pinyin.Pinyin;
import org.nlpcn.commons.lang.util.WordAlert;
import org.nlpcn.commons.lang.util.WordWeight;/*** * author mengyao**/
public class WordUtil {public static void main(String[] args) {System.out.println(2016/06/25.matches(^\\d{4}(\\-|\\/|\\.)\\d{1,2}\\1\\d{1,2}$));System.out.println(20160625.matches(^\\d{8}$));}/*** 文章摘要* param title* param content* return*/public static String getSummary(String title, String content) {SummaryComputer summaryComputer new SummaryComputer(title, content);Summary summary summaryComputer.toSummary();return summary.getSummary();}/*** 带标题的文章关键词提取* param title* param content* return*/public static ListKeyword getKeyWord(String title, String content) {ListKeyword keyWords new ArrayListKeyword();KeyWordComputerNlpAnalysis kwc new KeyWordComputerNlpAnalysis(20);CollectionKeyword result kwc.computeArticleTfidf(title, content);for (Keyword keyword : result) {keyWords.add(keyword);}return keyWords;} /*** 不带标题的文章关键词提取* param content* return*/public static ListKeyword getKeyWord2(String content) {ListKeyword keyWords new ArrayListKeyword();KeyWordComputerNlpAnalysis kwc new KeyWordComputerNlpAnalysis(20);CollectionKeyword result kwc.computeArticleTfidf(content);for (Keyword keyword : result) {keyWords.add(keyword);}return keyWords;} /*** 标准分词* param text* return*/public static ListTerm getToSeg(String text) {ListTerm words new ArrayListTerm();Result parse ToAnalysis.parse(text);for (Term term : parse) {if (null!term.getName()!term.getName().trim().isEmpty()) {words.add(term);}}return words;}/*** NLP分词* param text* return*/public static ListTerm getNlpSeg(String text) {ListTerm words new ArrayListTerm();Result parse NlpAnalysis.parse(text);for (Term term : parse) {if (null!term.getName()!term.getName().trim().isEmpty()) {words.add(term);}}return words;}/*** Index分词* param text* return*/public static ListTerm getIndexSeg(String text) {ListTerm words new ArrayListTerm();Result parse IndexAnalysis.parse(text);for (Term term : parse) {if (null!term.getName()!term.getName().trim().isEmpty()) {words.add(term);}}return words;}/*** 简体转繁体* param word* return*/public static 
String jian2fan(String text) {return JianFan.j2f(text);}/*** 繁体转简体* param word* return*/public static String fan2jian(String text) {return JianFan.f2j(text);}/*** 拼音(不带音标)* param word* return*/public static String pinyin(String text) {StringBuilder builder new StringBuilder();ListString pinyins Pinyin.pinyin(text);for (String pinyin : pinyins) {if (null ! pinyin) {builder.append(pinyin ); }}return builder.toString();}/*** 拼音(不带音标首字母大写)* param word* return*/public static String pinyinUp(String text) {StringBuilder builder new StringBuilder();ListString pinyins Pinyin.pinyin(text);for (String pinyin : pinyins) {if (StringUtils.isEmpty(pinyin)) {continue;}builder.append(pinyin.substring(0,1).toUpperCase()pinyin.substring(1));}return builder.toString();}/*** 拼音(带数字音标)* param word* return*/public static String tonePinyin(String text) {StringBuilder builder new StringBuilder();ListString pinyins Pinyin.tonePinyin(text);for (String pinyin : pinyins) {if (null ! pinyin) {builder.append(pinyin ); }}return builder.toString();}/*** 拼音(带符号音标)* param word* return*/public static String unicodePinyin(String text) {StringBuilder builder new StringBuilder();ListString pinyins Pinyin.unicodePinyin(text);for (String pinyin : pinyins) {if (null ! pinyin) {builder.append(pinyin ); }}return builder.toString();}/*** 词频统计* param words* return*/public static MapString, Double wordCount(ListString words) {WordWeight ww new WordWeight();for (String word : words) {ww.add(word);}return ww.export();}/*** 词频统计* param words* return*/public static ListString wordCount1(ListString words) {ListString wcs new ArrayListString();WordWeight ww new WordWeight();for (String word : words) {ww.add(word);}MapString, Double export ww.export();for (EntryString, Double entry : export.entrySet()) {wcs.add(entry.getKey():entry.getValue());}return wcs;}/*** 语种识别:1英文0中文* param words* return*/public static int language(String word) {return WordAlert.isEnglish(word)?1:0;}}