便宜的网站空间，邵阳建设银行网站是多少，建设积分网站，网站如何建设二级域名代理

转载自：java使用htmlparser提取网页纯文本例子。这篇文章主要介绍了java使用htmlparser提取网页纯文本例子，需要的朋友可以参考下。

package com.test;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filt…

转载自：java使用htmlparser提取网页纯文本例子。这篇文章主要介绍了java使用htmlparser提取网页纯文本例子，需要的朋友可以参考下。

package com.test;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;/**
* 标题:利用htmlparser提取网页纯文本的例子
*/
public class TestHTMLParser {public static void testHtml() {try {String sCurrentLine;String sTotalString;sCurrentLine ;sTotalString ;java.io.InputStream l_urlStream;java.net.URL l_url new java.net.URL(http://www.ideagrace.com/html/doc/2006/07/04/00929.html);java.net.HttpURLConnection l_connection (java.net.HttpURLConnection) l_url.openConnection();l_connection.connect();l_urlStream l_connection.getInputStream();java.io.BufferedReader l_reader new java.io.BufferedReader(new java.io.InputStreamReader(l_urlStream));while ((sCurrentLine l_reader.readLine()) ! null) {sTotalString sCurrentLine/r/n;// System.out.println(sTotalString);}String testText extractText(sTotalString);System.out.println( testText );} catch (Exception e) {e.printStackTrace();}}public static String extractText(String inputHtml) throws Exception {StringBuffer text new StringBuffer();Parser parser Parser.createParser(new String(inputHtml.getBytes(),GBK), GBK);// 遍历所有的节点NodeList nodes parser.extractAllNodesThatMatch(new NodeFilter() {public boolean accept(Node node) {return true;}});System.out.println(nodes.size()); //打印节点的数量for (int i0;inodes.size();i){Node nodet nodes.elementAt(i);//System.out.println(nodet.getText()); text.append(new String(nodet.toPlainTextString().getBytes(GBK))/r/n); }return text.toString();}public static void test5(String resource) throws Exception {Parser myParser new Parser(resource);myParser.setEncoding(GBK);String filterStr table;NodeFilter filter new TagNameFilter(filterStr);NodeList nodeList myParser.extractAllNodesThatMatch(filter);TableTag tabletag (TableTag) nodeList.elementAt(11);}public static void main(String[] args) throws Exception {// test5(http://www.google.com);testHtml();}
}