本文是在之前两篇文章的基础上进行写作的
(1条消息) 【java爬虫】使用selenium爬取优惠券_haohulala的博客-CSDN博客
(1条消息) 【java爬虫】使用selenium获取某宝联盟淘口令_haohulala的博客-CSDN博客
前两篇文章介绍了如何获取优惠券的基础信息，本文将获取到的…
本文是在之前两篇文章的基础上进行写作的
(1条消息) 【java爬虫】使用selenium爬取优惠券_haohulala的博客-CSDN博客
(1条消息) 【java爬虫】使用selenium获取某宝联盟淘口令_haohulala的博客-CSDN博客
前两篇文章介绍了如何获取优惠券的基础信息，本文将获取到的基本信息存到数据库中，并且利用数据库的排序获取优惠力度最大的优惠券。这里的优惠力度指的是用券前价格减去券后价格获得的优惠价格绝对值。
相对于之前的文章，本文中我优化了爬虫的爬取策略，把爬虫放到一个新的线程，一页一页不停地获取数据。在爬取的过程中我发现程序会比较脆弱，很多情况下都会导致程序崩溃，比如某个按钮被遮挡住了，selenium就找不到了，这个时候就会抛异常。像这类问题只能后面慢慢优化了，如果有时间的话。
首先来看一下爬虫线程运行的情况。
我们需要存到数据库中的数据比较多，更新了实体类。
Data
AllArgsConstructor
NoArgsConstructor
public class GoodItem {// 优惠券标题private String title;// 图片urlprivate String imgUrl;// 券前价格private Double prePrice;// 券后价格private Double postPrice;// 佣金率private Double commissionRate;// 佣金private Double commission;// 口令private String recommend;// 创建的时间private String serializeTime;// 优惠价格private Double preferentialPrice;// 优惠率private Double preferentialRate;// 计算优惠价格和优惠率public void calculatePreferentialPriceAndRate() {if(prePricenull || postPricenull) {preferentialPrice 0.0;preferentialRate 0.0;return;}preferentialPrice prePrice - postPrice;preferentialRate preferentialPrice / prePrice;// 保留四位小数preferentialPrice Double.parseDouble(String.format(%.4f, preferentialPrice));preferentialRate Double.parseDouble(String.format(%.4f, preferentialRate));}}
从上述代码中可以看出来，优惠价格和优惠率是自己计算的，在获取完券前价格和券后价格后就可以计算这两个值了。最后的结果保留四位小数：这里用的是 String.format 的 %.4f 格式化，它会把数值四舍五入到四位小数，而不是直接截断。
爬虫程序与之前的不同就是，每当获取一个完整的数据后就存到数据库中，然后一页一页不停地获取数据。
Slf4j
Service
public class SeleinumServiceImpl implements SeleinumService {private final String DRIVER_PATH E:/写作/优惠券项目/驱动/chromedriver.exe;Overridepublic void startSelenium() {// 实例化BrowserMob代理System.setProperty(webdriver.chrome.driver, DRIVER_PATH);BrowserMobProxy browserMobProxy new BrowserMobProxyServer();browserMobProxy.start();browserMobProxy.enableHarCaptureTypes(CaptureType.REQUEST_CONTENT, CaptureType.RESPONSE_CONTENT);browserMobProxy.setHarCaptureTypes(CaptureType.RESPONSE_CONTENT);browserMobProxy.newHar(kk);Proxy seleniumProxy ClientUtil.createSeleniumProxy(browserMobProxy);// 实例化SeleniumChromeOptions options new ChromeOptions();options.setProxy(seleniumProxy);options.setAcceptInsecureCerts(true);//options.setExperimentalOption(debuggerAddress, 127.0.0.1:9222);WebDriver driver new ChromeDriver(options);// 网络请求回调函数browserMobProxy.addRequestFilter(new RequestFilter() {Overridepublic HttpResponse filterRequest(HttpRequest httpRequest, HttpMessageContents httpMessageContents, HttpMessageInfo httpMessageInfo) {// 打印请求信息// log.info(request httpMessageInfo.getUrl());return null;}});// 网络响应回调函数browserMobProxy.addResponseFilter(new ResponseFilter() {Overridepublic void filterResponse(HttpResponse httpResponse, HttpMessageContents httpMessageContents, HttpMessageInfo httpMessageInfo) {// 这里获取打印的信息log.info(httpMessageInfo.getUrl());if(httpMessageInfo.getUrl().equals(https://pub.alimama.com/openapi/param2/1/gateway.unionpub/union.pub.entry)) {// 格式化输出String str JSONObject.toJSONString(httpMessageContents.getTextContents(), true);System.out.println(str);// 将数据写到文件中try {FileWriter writer new FileWriter(output.txt);writer.write(str);} catch (IOException e) {e.printStackTrace();}}}});// 打开网页driver.get(https://pub.alimama.com/portal/v2/pages/promo/goods/index.htm?pageNum2);}Overridepublic ListGoodItem getGoodInfo() {// 加载chrome驱动System.setProperty(webdriver.chrome.driver, DRIVER_PATH);ChromeOptions options new ChromeOptions();options.setExperimentalOption(debuggerAddress, 
127.0.0.1:9222);// 启动浏览器WebDriver driver new ChromeDriver(options);// 设置最长等待时间driver.manage().timeouts().implicitlyWait(5, TimeUnit.SECONDS);// 实例化一个列表存放数据ListGoodItem rstList new ArrayList();// 开始遍历卡片数据// 遍历100组数据暂停for(int i0; i100; ) {WebElement element driver.findElement(By.className(GoodsList__CardList-sc-84so0w-1));ListWebElement divList element.findElements(By.className(union-good-card-wrap));log.info(获取 divList.size() 个优惠券卡片);for(int j0; j divList.size(); j) {GoodItem item new GoodItem();// 图片urlitem.setImgUrl(divList.get(j).findElement(By.className(union-good-card-good-img-wrap-mediumn)).findElement(By.tagName(a)).findElement(By.tagName(img)).getDomAttribute(src));// 优惠券标题item.setTitle(divList.get(j).findElement(By.className(union-good-card-title)).findElement(By.tagName(span)).getText());// 券前价格item.setPrePrice(getPrice(divList.get(j).findElement(By.className(union-good-card-coupon-reserve-price-mediumn))));// 券后价格item.setPostPrice(getPrice(divList.get(j).findElement(By.className(union-good-card-coupon-final-price))));ListWebElement commissionList divList.get(j).findElements(By.className(union-good-card-commision-info-item));// 佣金率item.setCommissionRate(getPrice(commissionList.get(0)));// 佣金item.setCommission(getPrice(commissionList.get(1)));log.info(JSON.toJSONString(item));i;if(i 100) {log.info(100条数据获取完毕);return rstList;}}// 切换到下一页driver.findElement(By.className(GoodsList__Pagination-sc-84so0w-2)).findElement(By.className(mux-pagination-icon-next)).click();log.info(进入到下一页);}return rstList;}// 获取券前券后价格private Double getPrice(WebElement element) {StringBuilder sb new StringBuilder();sb.append(element.findElement(By.className(union-number-format-integer)).getText().replaceAll(,, ));sb.append(element.findElement(By.className(union-number-format-pointer)).getText());sb.append(element.findElement(By.className(union-number-format-decimal)).getText());Double price Double.parseDouble(sb.toString());return price;}}
数据库部分我们简单建一个表存数据然后再建一些索引方便去重和加速查询
use coupon_db;

/* Coupon information table */
drop table if exists t_coupon;
create table t_coupon(
    u_id bigint(20) unsigned NOT NULL AUTO_INCREMENT PRIMARY KEY COMMENT '优惠券id',
    title varchar(255) NOT NULL COMMENT '优惠券标题',
    img_url varchar(500) NOT NULL COMMENT '图片的url',
    pre_price double NOT NULL COMMENT '券前价格',
    post_price double NOT NULL COMMENT '券后价格',
    preferential_price double NOT NULL COMMENT '优惠价格',
    preferential_rate double NOT NULL COMMENT '优惠率',
    commission_rate double NOT NULL COMMENT '佣金率',
    commission double NOT NULL COMMENT '佣金',
    recommend varchar(500) NOT NULL COMMENT '淘口令',
    serialize_time varchar(50) NOT NULL COMMENT '创建的时间'
) ENGINE=InnoDB COMMENT='优惠券信息表';

/* Unique index on title: the title is what identifies a coupon as a duplicate */
create unique index title_index on t_coupon (title);

/* Plain indexes to speed up queries */
create index preferential_price_index on t_coupon (preferential_price);
create index serialize_time_index on t_coupon (serialize_time);
create index commission_index on t_coupon (commission);
create index commission_rate_index on t_coupon (commission_rate);
本文就简单写三个mapper接口来存数据和查数据
Mapper
public interface GoodMapper {// 清空表public void clearTable();// 插入一条数据public void insertOneItem(Param(item) GoodItem item);// 查询数据按照优惠价格降序排序public ListGoodItem selectByOrder(Param(start) int start,Param(num) int num);
}?xml version1.0 encodingUTF-8 ?
!DOCTYPE mapperPUBLIC -//mybatis.org//DTD Mapper 3.0//ENhttp://mybatis.org/dtd/mybatis-3-mapper.dtd
mapper namespacecom.example.demo.mapper.GoodMapperdelete idclearTabledelete from t_coupon where 11/deleteinsert idinsertOneItem parameterTypeGoodIteminsert into t_coupon(title, img_url, pre_price, post_price,commission_rate, commission, recommend, serialize_time,preferential_price, preferential_rate) values(#{item.title}, #{item.imgUrl}, #{item.prePrice}, #{item.postPrice},#{item.commissionRate}, #{item.commission}, #{item.recommend}, #{item.serializeTime},#{item.preferentialPrice}, #{item.preferentialRate})on duplicate key update titletitle/insertselect idselectByOrder resultTypeGoodItemselect * from t_couponorder by preferential_price DESClimit #{start}, #{num}/select/mapper
我们将开启爬虫和查询的接口都写在一个Controller里面
Controller
public class BootController {Autowiredprivate SeleniumPlusService seleniumPlusService;Autowiredprivate SelectService selectService;RequestMapping(/bootstart)ResponseBodypublic String bootstart() {// 创建一个线程去爬取优惠券数据new Thread(()-{seleniumPlusService.startSpider();}).start();return success;}// 查询数据RequestMapping(/select/{start}/{num})ResponseBodypublic String selectByOrderDESC(PathVariable(start) int start,PathVariable(num) int num) {ListGoodItem goodItemList selectService.selectByOrderDESC(start, num);return JSON.toJSONString(goodItemList);}}其中查询的Service非常简单就是执行mapper中的查询接口然后将数据返回
Service
public class SelectServiceImpl implements SelectService {Autowiredprivate GoodMapper goodMapper;Overridepublic ListGoodItem selectByOrderDESC(int start, int num) {return goodMapper.selectByOrder(start, end);}
}查询接口是一个GET请求请求参数是按照优惠价格降序排序后的数据。
比如下图执行的请求接口是 localhost:8080/select/0/10，查询数据库里面优惠价格最高的10条数据。
上述的查询对应的sql语句如下。
在SQL语句中，limit 后面的两个参数分别是开始的索引和查询的数据量，比如下面这条SQL语句的意思是从第0条数据开始查，一共查出10条数据。
-- Largest discount first; skip 0 rows, return 10.
SELECT *
FROM t_coupon
ORDER BY preferential_price DESC
LIMIT 0, 10
查询出来的数据如下
[{"commission": 2.64, "commissionRate": 5.3, "imgUrl": "//img.alicdn.com/bao/uploaded/i3/6000000002126/O1CN01Z5K9L61RZktaDO7mX_!!6000000002126-0-sm.jpg", "postPrice": 49.9, "prePrice": 99.0, "preferentialPrice": 49.1, "preferentialRate": 0.496, "recommend": "立白天然茶籽洗衣液6KG 家庭实惠洗衣除菌除螨 89.00元\n本月上新\n买它超值\nh:/89 CZ3457 tptfdGVZ6Wj\n", "serializeTime": "2023-07-16 13:16:28", "title": "立白天然茶籽洗衣液6KG 家庭实惠洗衣除菌除螨"
}, {"commission": 7.11, "commissionRate": 9.0, "imgUrl": "//img.alicdn.com/bao/uploaded/O1CN010BdQ3w1eaIJCsydxO_!!6000000003887-0-yinhe.jpg", "postPrice": 49.5, "prePrice": 89.0, "preferentialPrice": 39.5, "preferentialRate": 0.4438, "recommend": "Joocyee酵色唇釉琥珀唇彩丝绒口红哑光贝壳镜面唇泥太妃糖复古女 79.00元\n历史热推\n速速抢购手快有手慢无\nh:/68 CZ0001 30W9dGVaz77\n", "serializeTime": "2023-07-16 13:16:41", "title": "Joocyee酵色唇釉琥珀唇彩丝绒口红哑光贝壳镜面唇泥太妃糖复古女"
}, {"commission": 0.67, "commissionRate": 1.35, "imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01oDxvso1iw3dVd7jec_!!6000000004476-0-yinhe.jpg", "postPrice": 49.9, "prePrice": 84.9, "preferentialPrice": 35.0, "preferentialRate": 0.4122, "recommend": "蒙牛特仑苏纯牛奶250ml*16盒整箱学生早餐奶高端新旧包装混发 82.90元\n超十万人正在疯抢\n喜欢的宝宝们千万不要错过哦~赶紧买起来买起来~\nh:/49 CZ3457 kbB6dGeADLs\n", "serializeTime": "2023-07-16 13:16:01", "title": "蒙牛特仑苏纯牛奶250ml*16盒整箱学生早餐奶高端新旧包装混发"
}, {"commission": 0.9, "commissionRate": 1.5, "imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01hCQXtN1Oc5yax1WHF_!!6000000001725-0-yinhe.jpg", "postPrice": 46.57, "prePrice": 79.9, "preferentialPrice": 33.33, "preferentialRate": 0.4171, "recommend": "【零感003】杰士邦避孕套正品官方旗舰店安全套超薄男用裸入持久1 59.90元\n赠运费险\n买它就对了\nh:/77 CZ3457 Y5i7dGVZj30\n", "serializeTime": "2023-07-16 13:16:33", "title": "【零感003】杰士邦避孕套正品官方旗舰店安全套超薄男用裸入持久1"
}, {"commission": 0.33, "commissionRate": 0.3, "imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01BvRfuK22sf0fmsdtn_!!6000000007176-0-yinhe.jpg", "postPrice": 109.0, "prePrice": 139.0, "preferentialPrice": 30.0, "preferentialRate": 0.2158, "recommend": "罗马仕20000毫安充电宝双自带线双向快充大容量1万小巧移动电源闪充适用华为苹果iphone小米oppo手机专用户外 109.00元\n超十万人正在疯抢\n快~少量库存赶紧冲拼手速\nh:/19 CZ0001 0GK2dGVayGy\n", "serializeTime": "2023-07-16 13:16:24", "title": "罗马仕20000毫安充电宝双自带线双向快充大容量1万小巧移动电源闪充适用华为苹果iphone小米oppo手机专用户外"
}, {"commission": 1.6, "commissionRate": 1.8, "imgUrl": "//img.alicdn.com/bao/uploaded/i1/2200828292428/O1CN01f5SKRV1To4V1gBrc1_!!2200828292428.jpg", "postPrice": 89.0, "prePrice": 118.0, "preferentialPrice": 29.0, "preferentialRate": 0.2458, "recommend": "逐本清欢晨蜜自在自然植萃卸妆油敏弱肌脸部舒缓深层清洁卸妆水膏 89.00元\n回头客1万\n质量逆天赶紧的闭眼买都不亏\nh:/59 CZ0001 JrpUdGVafyH\n", "serializeTime": "2023-07-16 13:16:06", "title": "逐本清欢晨蜜自在自然植萃卸妆油敏弱肌脸部舒缓深层清洁卸妆水膏"
}, {"commission": 0.69, "commissionRate": 1.35, "imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01ry5fh31G8llXLIPuR_!!6000000000578-0-yinhe.jpg", "postPrice": 50.9, "prePrice": 74.4, "preferentialPrice": 23.5, "preferentialRate": 0.3159, "recommend": "蒙牛纯牛奶全脂灭菌乳250ml*24盒/1箱学生营养早餐搭配优质乳蛋白 71.40元\n回头客12万\n买它就对了\nh:/97 CZ3457 nlVhdGVZUPV\n", "serializeTime": "2023-07-16 13:16:10", "title": "蒙牛纯牛奶全脂灭菌乳250ml*24盒/1箱学生营养早餐搭配优质乳蛋白"
}, {"commission": 2.21, "commissionRate": 4.5, "imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01gO1IfQ1ljdhW0a0LT_!!6000000004855-0-yinhe.jpg", "postPrice": 49.0, "prePrice": 69.0, "preferentialPrice": 20.0, "preferentialRate": 0.2899, "recommend": "蕉下修容口罩EM320 防晒护眼角开车面罩防紫外线立体夏女透气户外 49.00元\n好评过万\n不要犹豫库存不多抓紧抢\nh:/59 CZ0001 LYJSdGVZeik\n", "serializeTime": "2023-07-16 13:16:37", "title": "蕉下修容口罩EM320 防晒护眼角开车面罩防紫外线立体夏女透气户外"
}, {"commission": 3.05, "commissionRate": 10.5, "imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01siAhJN1Hwyo2vfWAr_!!6000000000823-0-yinhe.jpg", "postPrice": 29.0, "prePrice": 49.0, "preferentialPrice": 20.0, "preferentialRate": 0.4082, "recommend": "猫人抗菌裆男士内裤男冰丝无痕四角裤运动纯棉裆平角大码裤青少年 29.00元\n回头客2万\n质量逆天赶紧的闭眼买都不亏\nh:/17 CZ0001 ZxlhdGVaNAb\n", "serializeTime": "2023-07-16 13:16:51", "title": "猫人抗菌裆男士内裤男冰丝无痕四角裤运动纯棉裆平角大码裤青少年"
}, {"commission": 1.79, "commissionRate": 6.0, "imgUrl": "//img.alicdn.com/bao/uploaded/O1CN01PyEz521NEuMNqT3Av_!!6000000001539-0-yinhe.jpg", "postPrice": 29.9, "prePrice": 49.9, "preferentialPrice": 20.0, "preferentialRate": 0.4008, "recommend": "【百亿补贴】进口茱蒂丝巧克力夹心饼干纯可可脂儿童健康休闲零食 29.90元\n近7天浏览过万\n这价位能做到这样真的无可挑剔\nh:/98 CZ0001 eJUpdGVZvWR\n", "serializeTime": "2023-07-16 13:17:51", "title": "【百亿补贴】进口茱蒂丝巧克力夹心饼干纯可可脂儿童健康休闲零食"
}]
可以看到有一些优惠券的优惠力度还是挺大的，可以省几十元，就是不知道这个券前价格准不准，哈哈哈。