Crawling Baidu News with a Java web crawler
Published: 2019-06-22


The crawler uses commons-httpclient.

commons-httpclient is a legacy project and is no longer recommended by Apache; it has been superseded by HttpComponents HttpClient.
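For comparison only (this is not part of the original post): the same GET request with the current Apache HttpComponents client (4.x) would look roughly like the sketch below. The URL and request headers are the ones used in the crawler further down; the class name ModernFetch and everything else here is an illustrative assumption, not the author's code.

    import java.nio.charset.StandardCharsets;

    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;

    // Minimal sketch: fetch the seed page with HttpComponents HttpClient 4.x.
    public class ModernFetch {
        public static void main(String[] args) throws Exception {
            String urlSeed = "http://news.baidu.com/n?cmd=4&class=sportnews&pn=1&from=tab";
            // Same request headers as the commons-httpclient version below.
            HttpGet get = new HttpGet(urlSeed);
            get.setHeader("Referer", "http://www.baidu.com");
            get.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");

            try (CloseableHttpClient client = HttpClients.createDefault();
                 CloseableHttpResponse response = client.execute(get)) {
                // Drain the response body, falling back to UTF-8 if no charset is declared.
                String pageSourceCode = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
                System.out.println(pageSourceCode);
            }
        }
    }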

Lucene 4.3 is used for indexing and search.

Required jar files:

(screenshot of the required jar files)

package com.lulei.util;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NRTManager;
import org.apache.lucene.search.NRTManager.TrackingIndexWriter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

public class MyCrawl {
    private static int maxConnectTimes = 3;
    private static HttpClient httpClient = new HttpClient();
    private static Logger log = Logger.getLogger(MyCrawl.class);
    private static Header[] responseHeaders = null;
    private static String pageSourceCode = "";
    // Default page encoding.
    private static String charsetName = "iso-8859-1";

    // The regex has to be written against the raw page source; the markup shown
    // in Firebug is not what the server actually returns.
    // Crawl the page and build the index.
    public static void main(String[] args) {
        String urlSeed = "http://news.baidu.com/n?cmd=4&class=sportnews&pn=1&from=tab";
        HashMap<String, String> params = new HashMap<String, String>();
        params.put("Referer", "http://www.baidu.com");
        params.put("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
        GetMethod getMethod = new GetMethod(urlSeed);
        Iterator iter = params.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry entry = (Map.Entry) iter.next();
            String key = (String) entry.getKey();
            String val = (String) entry.getValue();
            getMethod.setRequestHeader(key, val);
        }
        // Fetch the page source into pageSourceCode.
        try {
            readPage(getMethod, "utf-8", urlSeed);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(pageSourceCode);
        // The regex literal and the link-extraction / index-building code that
        // followed it are missing from the mirrored copy of this post.
        String regexStr = "";
    }

    // Fetch urlStr with the given method and store the page source in pageSourceCode.
    // (The signature is reconstructed from the call site above; it was cut off in the mirror.)
    private static boolean readPage(HttpMethod method, String defaultCharset,
            String urlStr) throws Exception {
        int n = maxConnectTimes;
        while (n > 0) {
            try {
                int statusCode = httpClient.executeMethod(method);
                if (statusCode != HttpStatus.SC_OK) {
                    log.error("can not connect " + urlStr + "\t"
                            + (maxConnectTimes - n + 1) + "\t" + statusCode);
                    n--;
                } else {
                    // Read the response headers.
                    responseHeaders = method.getResponseHeaders();
                    // Read the page source.
                    InputStream inputStream = method.getResponseBodyAsStream();
                    BufferedReader bufferedReader = new BufferedReader(
                            new InputStreamReader(inputStream, charsetName));
                    StringBuffer stringBuffer = new StringBuffer();
                    String lineString = null;
                    while ((lineString = bufferedReader.readLine()) != null) {
                        stringBuffer.append(lineString);
                        stringBuffer.append("\n");
                    }
                    pageSourceCode = stringBuffer.toString();
                    InputStream in = new ByteArrayInputStream(
                            pageSourceCode.getBytes(charsetName));
                    // CharsetUtil is the author's helper class (same package) that
                    // detects the real charset of the stream.
                    String charset = CharsetUtil.getStreamCharset(in, defaultCharset);
                    // This special case was added for the IP geolocation lookup pages.
                    if ("Big5".equals(charset)) {
                        charset = "gbk";
                    }
                    if (!charsetName.toLowerCase().equals(charset.toLowerCase())) {
                        pageSourceCode = new String(
                                pageSourceCode.getBytes(charsetName), charset);
                    }
                    return true;
                }
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println(urlStr + " -- can't connect  "
                        + (maxConnectTimes - n + 1));
                n--;
            }
        }
        return false;
    }

    // Near-real-time search over the index.
    @Test
    public void search() {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(
                Version.LUCENE_43, analyzer);
        indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        String indexFile = "D:/index/knnik";
        Directory directory = null;
        try {
            directory = NIOFSDirectory.open(new File(indexFile));
            // Open the index for writing.
            IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
            TrackingIndexWriter trackingIndexWriter = new TrackingIndexWriter(indexWriter);
            NRTManager nrtManager = new NRTManager(trackingIndexWriter,
                    new SearcherFactory());
            // Acquire a near-real-time searcher.
            IndexSearcher indexSearch = nrtManager.acquire();
            /*
             * The usual, non-real-time way to obtain an IndexSearcher:
             * IndexReader indexReader = DirectoryReader.open(directory);
             * IndexSearcher indexSearch = new IndexSearcher(indexReader);
             */
            Term term = new Term("content", "我们");
            Query query = new TermQuery(term);
            TopDocs topDocs = indexSearch.search(query, 10);
            System.out.println("-------- total number of hits --------");
            int totalHits = topDocs.totalHits;
            System.out.println("totalHits" + ":" + totalHits);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                // scoreDoc.doc is the internal document id.
                int docId = scoreDoc.doc;
                System.out.println("docId:" + docId);
                Document document = indexSearch.doc(docId);
                System.out.println(document.get("id"));
                System.out.println(document.get("title"));
                System.out.println(document.get("content"));
                System.out.println(document.get("url"));
            }
            nrtManager.release(indexSearch);
            nrtManager.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
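The mirrored copy of the post loses the part of main() that extracts the news links from pageSourceCode and writes them into the Lucene index, so the snippet below is only a rough sketch of what that step could look like. It assumes the field names that search() reads back (id, title, content, url) and the same index directory; the regex, the class name IndexNewsLinks, and the use of the article title as a stand-in for the body are illustrative assumptions, not the original author's code.

    import java.io.File;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.IndexWriterConfig.OpenMode;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.NIOFSDirectory;
    import org.apache.lucene.util.Version;

    public class IndexNewsLinks {

        // Hypothetical pattern for <a href="...">title</a> pairs in the Baidu News
        // listing; the original post's regex was lost, so this is only illustrative.
        private static final Pattern LINK_PATTERN =
                Pattern.compile("<a href=\"(http://[^\"]+)\"[^>]*>([^<]+)</a>");

        public static void indexPage(String pageSourceCode) throws Exception {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
            Directory directory = NIOFSDirectory.open(new File("D:/index/knnik"));
            IndexWriter writer = new IndexWriter(directory, config);
            try {
                Matcher matcher = LINK_PATTERN.matcher(pageSourceCode);
                int id = 0;
                while (matcher.find()) {
                    String url = matcher.group(1);
                    String title = matcher.group(2);
                    Document doc = new Document();
                    // Field names match what search() reads back: id, title, content, url.
                    doc.add(new StringField("id", String.valueOf(id++), Field.Store.YES));
                    doc.add(new TextField("title", title, Field.Store.YES));
                    // A real crawler would fetch url and store the article body here;
                    // the title is used as a stand-in for "content" in this sketch.
                    doc.add(new TextField("content", title, Field.Store.YES));
                    doc.add(new StringField("url", url, Field.Store.YES));
                    writer.addDocument(doc);
                }
                writer.commit();
            } finally {
                writer.close();
            }
        }
    }

In the full crawler each matched url would itself be fetched with readPage() and the downloaded article text stored in the content field before the document is added to the index.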

The code is hosted on GitHub:

Reference article:

This article originally appeared on the "" blog; please keep this attribution when reposting.

Repost source: http://sdmia.baihongyu.com/
