京东手机信息爬取（全部手机）

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>&gt.

Norni

547人浏览 · 2020-06-04 15:55:00

Norni · 2020-06-04 15:55:00 发布

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

仅学习参考，不可用于商业用途

version_0

说明：单线程爬虫，使用的模块均为Python自带模块，包括urllib、json等

　　　写这个爬虫是为了熟悉urllib的基本使用，包括常用函数：urllib.request.build_opener()、urllib.parse.urljoin()、urllib.parse.quote()、urllib.request.urlopen()、

　　　urllib.request.install_opener()、http.cookiejar、urllib.request.HTTPSHandler()、urllib.request.HTTPCookieProcessor()

　　　请求间隔通过random.uniform()随机选取（每次请求后随机休眠1～2秒）

　　　本爬虫目前只支持获取手机页面的信息。

　　　所有的图片信息，以链接方式保存。可以使用urllib.request.urlretrieve()下载。

　　　若要构造多线程爬虫，请参考：https://www.cnblogs.com/nuochengze/p/12861358.html

效果预览：

源码如下：

from urllib import request
from urllib import parse
from urllib import error
from http import cookiejar
import re
from pprint import pprint
import time
import random
import json


class JdPhoneInfo(object):
    """Single-threaded scraper for JD.com search-result pages.

    The scraper requests the search page for ``key_word``, extracts the pager
    values and the product listings with regular expressions, appends each
    parsed page to ``jindong_phone_info.json`` and then walks the follow-up
    ``s_new.php`` URLs, sleeping a random 1-2 seconds between requests.
    """

    _USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"

    # Patterns are compiled once here instead of on every product iteration.
    _PAGE_COUNT_RE = re.compile(r'page_count:\"(.*?)\"', re.S)
    _PAGE_CURRENT_RE = re.compile(r'page:"(.*?)",page_count', re.S)
    _PRODUCT_RE = re.compile(r'class="p-img"(.*?)class="p-icons"', re.S)
    _NAME_SECTION_RE = re.compile(r'p-name p-name-type-2(.*?)</div>', re.S)
    _TITLE_RE = re.compile(r'em>(.*?)</em>', re.S)
    _TITLE_NOISE_RE = re.compile(r'\n|\t|\s|(<.*?>)')
    _HREF_RE = re.compile(r'href="(.*?)"', re.S)
    _PRICE_SECTION_RE = re.compile(r'class="p-price"(.*?)</div>', re.S)
    _PRICE_RE = re.compile(r'i>(.*?)</i>', re.S)
    _IMG_ITEM_RE = re.compile(r'class="ps-item">(.*?)</li>', re.S)
    _IMG_TITLE_RE = re.compile(r'title="(.*?)">', re.S)
    _IMG_HREF_RE = re.compile(r'data-lazy-img="(.*?)"', re.S)
    _SHOP_SECTION_RE = re.compile(r'class="p-shop"(.*?)</div>', re.S)
    _SHOP_NAME_RE = re.compile(r'title="(.*?)"', re.S)

    def __init__(self, key_word):
        """Store the search keyword; the HTTP opener is created lazily."""
        self.key_word = key_word
        self._opener = None

    # ------------------------------------------------------------------
    # small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _first(pattern, text):
        """Return the first captured group of *pattern* in *text*, or None."""
        match = pattern.search(text)
        return match.group(1) if match else None

    @staticmethod
    def _to_int(value):
        """Convert *value* to int, returning None when it is missing or non-numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _with_scheme(href):
        """Prefix a scheme-relative link with ``https:``; None stays None."""
        return "https:" + href if href is not None else None

    def _get_opener(self):
        """Build the cookie-aware opener once and reuse it for every request.

        Reusing one CookieJar is what lets cookies set by the first response
        be sent with the follow-up requests.
        """
        opener = getattr(self, "_opener", None)
        if opener is None:
            opener = request.build_opener(
                request.HTTPCookieProcessor(cookiejar.CookieJar()),
                request.HTTPSHandler(),
            )
            # Kept so plain request.urlopen() calls share the same opener.
            request.install_opener(opener)
            self._opener = opener
        return opener

    def _parse_product(self, product_html):
        """Extract one product's fields from its listing fragment.

        A field whose markup is absent is reported as None instead of raising.
        """
        info = dict()

        # Title and product link.
        name_section = self._first(self._NAME_SECTION_RE, product_html) or ""
        raw_title = self._first(self._TITLE_RE, name_section)
        if raw_title is None:
            info["title"] = None
        else:
            info["title"] = self._TITLE_NOISE_RE.sub('', raw_title).strip()
        info["href"] = self._with_scheme(self._first(self._HREF_RE, name_section))

        # Price.
        price_section = self._first(self._PRICE_SECTION_RE, product_html) or ""
        info["price"] = self._first(self._PRICE_RE, price_section)

        # Thumbnail images, stored as links.
        info["pic_info"] = list()
        for img_html in self._IMG_ITEM_RE.findall(product_html):
            pic_href = self._first(self._IMG_HREF_RE, img_html)
            info["pic_info"].append({
                "pic_title": self._first(self._IMG_TITLE_RE, img_html),
                # "---" is the placeholder for a missing image link.
                "pic_href": "https:" + pic_href if pic_href is not None else "---",
            })

        # Comment link is derived from the product link, so it needs one.
        if info["href"] is not None:
            info["comment_href"] = info["href"] + "#comment"
        else:
            info["comment_href"] = None

        # Selling shop and its link.
        shop_section = self._first(self._SHOP_SECTION_RE, product_html) or ""
        info["store"] = {
            "shop_name": self._first(self._SHOP_NAME_RE, shop_section),
            "shop_href": self._with_scheme(self._first(self._HREF_RE, shop_section)),
        }
        return info

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def get_url(self, key_word, page_num, page_count):
        """Build the follow-up ``s_new.php`` URLs.

        For every ``n`` in ``[page_num, page_count)`` one URL is produced with
        query parameters ``keyword=key_word``, ``page=n+1`` and ``s=n*30``.

        Returns:
            list of absolute URL strings (empty when ``page_num >= page_count``).
        """
        url_base = "https://search.jd.com/s_new.php?keyword=%E6%89%8B%E6%9C%BA&page=2&s=30"
        url_list = list()
        for current in range(page_num, page_count):
            query = parse.urlencode({
                "keyword": key_word,
                "page": current + 1,
                "s": current * 30,
            })
            url_list.append(parse.urljoin(base=url_base, url="s_new.php?" + query))
        return url_list

    def parse_info(self, html_str):
        """Parse one response body.

        Returns:
            dict with ``page_count`` and ``page_current`` (int, or None when
            the value is absent or non-numeric) and ``product_list`` (one dict
            per listing found between ``class="p-img"`` and
            ``class="p-icons"``).
        """
        page_info = dict()
        # Each pager value is looked up and validated independently.
        page_info["page_count"] = self._to_int(self._first(self._PAGE_COUNT_RE, html_str))
        page_info["page_current"] = self._to_int(self._first(self._PAGE_CURRENT_RE, html_str))
        page_info["product_list"] = [
            self._parse_product(product_html)
            for product_html in self._PRODUCT_RE.findall(html_str)
        ]
        return page_info

    def get_request(self, first_url, url=None, url_index_num=None, url_list=None):
        """Open a URL with a browser User-Agent and return the response object.

        When ``url_list`` is None, ``first_url`` itself is requested. Otherwise
        ``url`` is requested with a ``referer`` header: the previous entry of
        ``url_list``, or ``first_url`` when ``url_index_num`` is 0 or None.

        The caller is responsible for closing the returned response.
        """
        opener = self._get_opener()
        if url_list is not None:
            request_ = request.Request(url=url)
            if url_index_num:
                referer = url_list[url_index_num - 1]
            else:
                referer = first_url
            request_.add_header(key="referer", val=referer)
        else:
            request_ = request.Request(url=first_url)
        request_.add_header("User-Agent", self._USER_AGENT)
        return opener.open(request_)

    def save_content(self, info):
        """Append *info* as indented JSON to ``jindong_phone_info.json``.

        A newline follows each record so consecutive documents stay separable.
        """
        with open("jindong_phone_info.json", 'a', encoding='utf8') as f:
            f.write(json.dumps(info, ensure_ascii=False, indent=2))
            f.write("\n")
        print("当前写入url", info.get("page_current"))

    def run(self):
        """Scrape the first result page, then every follow-up URL.

        If the pager values cannot be read from the first page, only that page
        is saved and the method returns.
        """
        first_url = "https://search.jd.com/Search?keyword={}".format(parse.quote(self.key_word))

        # First page: it carries the pager values needed to build the rest.
        with self.get_request(first_url=first_url) as response:
            first_response_html = response.read().decode()
        page_info = self.parse_info(first_response_html)
        self.save_content(page_info)

        page_current = page_info["page_current"]
        page_count = page_info["page_count"]
        if page_current is None or page_count is None:
            return

        url_list = self.get_url(self.key_word, page_num=page_current, page_count=page_count)
        for index, url in enumerate(url_list):
            with self.get_request(first_url=first_url, url=url, url_list=url_list, url_index_num=index) as response:
                response_html = response.read().decode()
            self.save_content(self.parse_info(response_html))
            # Random pause between requests to throttle the crawl.
            time.sleep(random.uniform(1, 2))
            
    
if __name__ == "__main__":
    # Script entry point. The keyword is fixed here; replace the literal with
    # input("请输入关键字:") to prompt for it interactively instead.
    key_word = "手机"
    print("本程序将采集以下信息:标题及连接，价格，图片，评价连接，售卖店铺及链接")
    # Build the scraper for the keyword and start crawling right away.
    JdPhoneInfo(key_word).run()

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

快递鸟一站式物流API解决方案

电商企业物流数字化转型必备！快递鸟 API 接口，72 小时快速完成物流系统集成。全流程实战1V1指导，营造开放的API技术生态圈。

更多推荐

苹方字体跨平台解决方案：告别Windows与Mac的字体显示鸿沟

在Web开发中，我们经常面临一个令人头疼的问题：精心设计的页面在Mac上优雅精致，到了Windows设备上却因字体差异而显得平庸。今天，我们为您介绍一个专业的解决方案——PingFangSC字体包，它让苹方字体的优雅设计能够在所有平台上完美呈现。这个开源项目提供了完整的6种字重，支持ttf和woff2双格式，真正实现了跨平台字体统一。## 为什么跨平台字体一致性如此重要？🔍现代Web应用

快递鸟社区

Ascend-SACT/Mineru-Optimization后端引擎对比：Pipeline、Hybrid与VLM模式如何选择？

Ascend-SACT/Mineru-Optimization提供三种强大的后端引擎模式——Pipeline、Hybrid和VLM，帮助用户高效处理各类文档。本文将深入对比这三种模式的核心特性、性能表现和适用场景，助你快速找到最适合的解决方案。## 三大引擎模式核心特性解析 🚀### Pipeline模式：传统OCR流程的极致优化**核心架构**：采用模块化设计，包含版面分析、OCR、

快递鸟社区

如何永久保存微信聊天记录？WeChatMsg免费开源工具终极指南

你是否曾担心更换手机后，那些珍贵的微信对话会永远消失？与家人的温馨聊天、重要的工作沟通、朋友间的难忘回忆，这些数字记忆都值得被永久珍藏。**WeChatMsg**是一款完全免费的开源工具，专门用于**微信聊天记录永久保存和深度分析**，让你的每一段对话都能成为永恒的数字资产。## 🔍 你的聊天记录正在面临什么风险？微信已经成为我们日常生活中不可或缺的沟通工具，但官方并未提供完整的聊天记录