第一次新闻爬虫实验

附代码

“Wangyiming”

Jul 22, 2020 4 min read front-end

这学期Web编程课的第一个实验项目是新闻网站的爬虫。第一次接触，还很生疏，中间也遇到了很多困难。

具体要求

1、选取3-5个代表性的新闻网站（比如新浪新闻、网易新闻等，或者某个垂直领域权威性的网站，比如经济领域的雪球财经、东方财富等，或者体育领域的腾讯体育、虎扑体育等）建立爬虫，针对不同网站的新闻页面进行分析，爬取出编码、标题、作者、时间、关键词、摘要、内容、来源等结构化信息，存储在数据库中。 2、建立网站，提供对爬取内容的分项全文搜索，并给出所查关键词的时间热度分析。 技术要求： 1、必须采用Node.js实现网络爬虫； 2、必须采用Node.js实现查询网站后端，HTML+JS实现前端（尽量不要使用任何前后端框架）。

前期准备

安装 Node.js 和几个模块库；安装 IDE：VS Code

过程

1.爬虫

新闻网站选取了中国青年网的新闻版块http://news.youth.cn/

// Crawler configuration for the target news site.
// NOTE(review): the article text above cites http://news.youth.cn/ (no "www.");
// confirm which host is intended before running the crawler.
var source_name = "中国青年网"; // human-readable name of the news source
var domain = 'http://www.news.youth.cn/'; // site root
var myEncoding = "GBK"; // character encoding used when decoding fetched pages
var seedURL = 'http://www.news.youth.cn/'; // seed page the crawl starts from

观察网页源代码得到

引用模块库

var fs = require('fs'); var myRequest = require('request'); var myCheerio = require('cheerio'); var myIconv = require('iconv-lite'); require('date-utils');

老师提供的防止网站屏蔽爬虫的代码

// Browser-like request headers, used to keep the target site from blocking the crawler.
// The declaration must sit on its own line: when it shares a line with the
// leading `//` comment, the whole statement is commented out and `headers`
// is never defined.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
};

seedget();

        } catch (e) { console.log('识别种子页面中的新闻链接出错：' + e) }

        if (!url_reg.test(myURL)) return; //检验是否符合新闻url的正则表达式
        //console.log(myURL);

        var fetch_url_Sql = 'select url from fetches where url=?';
        var fetch_url_Sql_Params = [myURL];
        mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
            if (vals.length > 0) {
                console.log('URL duplicate!')
            } else newsGet(myURL); //读取新闻页面
        });
    });
});

};

    console.log("转码读取成功:" + myURL);
    //动态执行format字符串，构建json对象准备写入文件或数据库
    var fetch = {};
    fetch.title = "";
    fetch.content = "";
    fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD");
    //fetch.html = myhtml;
    fetch.url = myURL;
    fetch.source_name = source_name;
    fetch.source_encoding = myEncoding; //编码
    fetch.crawltime = new Date();

    if (keywords_format == "") fetch.keywords = source_name; // eval(keywords_format);  //没有关键词就用sourcename
    else fetch.keywords = eval(keywords_format);

    if (title_format == "") fetch.title = ""
    else fetch.title = eval(title_format); //标题

    if (date_format != "") fetch.publish_date = eval(date_format); //刊登日期   
    console.log('date: ' + fetch.publish_date);
    fetch.publish_date = regExp.exec(fetch.publish_date)[0];
    fetch.publish_date = fetch.publish_date.replace('年', '-')
    fetch.publish_date = fetch.publish_date.replace('月', '-')
    fetch.publish_date = fetch.publish_date.replace('日', '')
    fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");

    if (author_format == "") fetch.author = source_name; //eval(author_format);  //作者
    else fetch.author = eval(author_format);

    if (content_format == "") fetch.content = "";
    else fetch.content = eval(content_format).replace("\r\n" + fetch.author, ""); //内容,是否要去掉作者信息自行决定

    if (source_format == "") fetch.source = fetch.source_name;
    else fetch.source = eval(source_format).replace("\r\n", ""); //来源

    if (desc_format == "") fetch.desc = fetch.title;
    else fetch.desc = eval(desc_format).replace("\r\n", ""); //摘要    

    // var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
    //     "_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
    // ////存储json
    // fs.writeFileSync(filename, JSON.stringify(fetch));

    var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
        'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
    var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
        fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
        fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
    ];

    //执行sql，数据库中fetch表里的url属性是unique的，不会把重复的url内容写入数据库
    mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
        if (qerr) {
            console.log(qerr);
        }
    }); //mysql写入
});

}

引入MySQL

建立mysql数据库

CREATE TABLE fetches ( id_fetches int(11) NOT NULL AUTO_INCREMENT, url varchar(200) DEFAULT NULL, source_name varchar(200) DEFAULT NULL, source_encoding varchar(45) DEFAULT NULL, title varchar(200) DEFAULT NULL, keywords varchar(200) DEFAULT NULL, author varchar(200) DEFAULT NULL, publish_date date DEFAULT NULL, crawltime datetime DEFAULT NULL, content longtext, createtime datetime DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (id_fetches), UNIQUE KEY id_fetches_UNIQUE (id_fetches), UNIQUE KEY url_UNIQUE (url) ) ENGINE=InnoDB DEFAULT CHARSET=utf8;

2.建立网站

前端搭建

后端搭建

})

第一次爬虫项目，觉得很难，很有挑战性，希望下一次能有所进步。

front-end

第一次新闻爬虫实验

具体要求

前期准备

过程

1.爬虫

2.建立网站

王艺鸣