Web Scraping

urllib

request

  • urlopen(url) --- opens the target url for reading
    • the response is of type http.client.HTTPResponse; common methods:
      • read(): reads the whole body as bytes; read(num): limits the number of bytes returned
      • readline(): reads a single line
      • readlines(): reads line by line and returns a list
      • getcode(): returns the status code
      • geturl(): returns the url
      • getheaders(): returns the response headers as a list of tuples
import urllib.request as p

url = 'http://www.baidu.com'
response = p.urlopen(url)

# read() returns the raw body as bytes
# to turn the bytes into a string, decode them
# bytes --> str: decode('<encoding>')
content = response.read().decode('utf-8')

print(content)
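
The other accessors listed above can be checked on the same response object (a minimal sketch reusing the response from the code above):

# status code, final url and response headers of the request above
print(response.getcode())     # e.g. 200
print(response.geturl())      # the url that was actually fetched
print(response.getheaders())  # a list of (header, value) tuples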
  • urlretrieve(url, filename) --- downloads the target url to a local file
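A minimal usage sketch, assuming the target url and the local file name 'baidu.html' are just placeholders:

import urllib.request

# download the page and save it directly to a local file
urllib.request.urlretrieve('http://www.baidu.com', 'baidu.html')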

  • Request(url=url, headers=headers) --- customizes a request and returns a urllib.request.Request object --- urlopen alone cannot carry request headers

import urllib.request

url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

request = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf8')
print(content)
  • HTTPHandler() ---- builds a handler that, combined into an opener, allows more advanced request customization
# goal: use a handler to fetch the Baidu homepage and get its source

import urllib.request
url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
request = urllib.request.Request(url = url,headers = headers)

# handler   build_opener   open

# (1) create the handler object
handler = urllib.request.HTTPHandler()
# (2) build the opener object
opener = urllib.request.build_opener(handler)
# (3) call the open method
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
  • ProxyHandler(proxies) --- routes requests through the given proxies
import urllib.request

url = 'http://www.baidu.com/s?wd=ip'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# build the customized request object
request = urllib.request.Request(url = url,headers= headers)

# simulate a browser visiting the server
# response = urllib.request.urlopen(request)

proxies = {
    'http':'118.24.219.151:16817'
}
# handler  build_opener  open
handler = urllib.request.ProxyHandler(proxies = proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')

Proxy pool

import random
import urllib.request

# a pool of candidate proxies (both entries here are placeholders for real proxy addresses)
proxies_pool = [
    {'http':'118.24.219.151:16817'},
    {'http':'118.24.219.151:16817'},
]
proxies = random.choice(proxies_pool)

url = 'http://www.baidu.com/s?wd=ip'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

request = urllib.request.Request(url = url,headers=headers)
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')

parse

  • quote(param) --- percent-encodes a single parameter value so it can be placed in a url
import urllib.request
import urllib.parse

url = 'https://www.baidu.com/s?wd='
# customizing the request object (adding a User-Agent) is the first counter-measure against anti-scraping checks
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
# percent-encode the keyword '周杰伦' so it can go into the url
# this relies on urllib.parse
name = urllib.parse.quote('周杰伦')
url = url + name

request = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
  • urlencode(params) --- percent-encodes several key/value parameters at once
import urllib.parse

base_url = 'https://www.baidu.com/s?'
data = {
    'wd':'周杰伦',
    'sex':'男',
    'location':'中国台湾省'
}

# urlencode joins and percent-encodes all key/value pairs: wd=...&sex=...&location=...
new_data = urllib.parse.urlencode(data)
url = base_url + new_data

A POST body must be byte-encoded with encode(), and the request object must be customized

url = 'https://fanyi.baidu.com/sug'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

data = {
    'kw':'spider'
}
# urlencode the form data, then encode it to bytes (POST bodies must be bytes)
data = urllib.parse.urlencode(data).encode('utf-8')
# build the customized request; passing data makes urlopen send a POST
request = urllib.request.Request(url=url,data=data,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8') # a JSON string with unicode escapes; deserialize it with json

import json

obj = json.loads(content)
print(obj)

Example: Baidu Translate detailed translation (see the Examples section below)

Example: Douban movies (see the Examples section below)

error

Exceptions

  • HTTPError is a subclass of URLError
  • HTTPError represents error status codes returned by an HTTP request (404, 500, etc. are caught here)
  • URLError covers failures such as an unreachable server, DNS errors and timeouts; unlike HTTPError it is not specific to HTTP responses
import urllib.request
import urllib.error

# url = 'https://blog.csdn.net/sulixu/article/details/1198189491'

url = 'http://www.doudan1111.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

try:
    request = urllib.request.Request(url = url, headers = headers)

    response = urllib.request.urlopen(request)

    content = response.read().decode('utf-8')

    print(content)
except urllib.error.HTTPError as e:
    print('the system is being upgraded...', e)
except urllib.error.URLError as e:
    print('as I said, the system is being upgraded...', e)

Parsing

lxml

XPath parsing

  • local files --- etree.parse()

  • data returned by the server: response.read().decode('utf-8') ----- etree.HTML()

Local file

from lxml import etree

# parse a local html file with xpath
tree = etree.parse('070_尚硅谷_爬虫_解析_xpath的基本使用.html')
li_list = tree.xpath('xpath expression here')

# check the length of the returned list
print(li_list)
print(len(li_list))
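
A few common XPath patterns (a minimal sketch; the tag names, ids and classes are hypothetical and only illustrate the syntax):

# all li elements under any ul
# li_list = tree.xpath('//ul/li')
# li elements that have an id attribute
# li_list = tree.xpath('//ul/li[@id]')
# text of the li whose id is "l1"
# li_list = tree.xpath('//ul/li[@id="l1"]/text()')
# value of the href attribute of every a element
# href_list = tree.xpath('//a/@href')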

From the network

import urllib.request

url = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}
request = urllib.request.Request(url = url,headers = headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

# parse the page source to extract the data we want
from lxml import etree
# parse the document returned by the server
tree = etree.HTML(content)
# extract the target data; xpath() always returns a list
result = tree.xpath('//input[@id="su"]/@value')[0]
print(result)

Example: 站长素材 (sc.chinaz.com) image download (see the Examples section below)

jsonpath

pip install jsonpath
import json
import jsonpath

obj = json.load(open('073_尚硅谷_爬虫_解析_jsonpath.json','r',encoding='utf-8'))
book_list = jsonpath.jsonpath(obj,'jsonpath expression here')
print(book_list)
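
Some typical JSONPath expressions (a minimal sketch; the keys assume a bookstore-style JSON document, so treat them as hypothetical):

# all authors anywhere in the document
# jsonpath.jsonpath(obj, '$..author')
# every book in the store
# jsonpath.jsonpath(obj, '$.store.book[*]')
# books whose price is below 10
# jsonpath.jsonpath(obj, '$..book[?(@.price < 10)]')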

bs4

pip install bs4

Supports CSS selectors

from bs4 import BeautifulSoup

# demonstrate the basic bs4 syntax by parsing a local file
# the default encoding when opening files may be gbk, so specify the encoding explicitly
soup = BeautifulSoup(open('075_尚硅谷_爬虫_解析_bs4的基本使用.html',encoding='utf-8'),'lxml')

# find a node by tag name
# returns the first element that matches
# print(soup.a)
# get the tag's attributes and their values
# print(soup.a.attrs)

# some bs4 functions
# (1) find
# returns the first element that matches
# print(soup.find('a'))

# find the tag whose title attribute has the given value
# print(soup.find('a',title="a2"))

# find the tag by its class value; note that the keyword must be class_ (with an underscore)
# print(soup.find('a',class_="a1"))


# (2) find_all: returns a list containing every a tag
# print(soup.find_all('a'))

# to search for several tag names at once, pass a list to find_all
# print(soup.find_all(['a','span']))

# limit returns only the first n matches
# print(soup.find_all('li',limit=2))


# (3) select (recommended)
# select returns a list with every matching element
# print(soup.select('a'))

# a leading . selects by class (a class selector)
# print(soup.select('.a1'))

# a leading # selects by id
# print(soup.select('#l1'))


# attribute selector --- find tags by attribute
# li tags that have an id attribute
# print(soup.select('li[id]'))

# li tags whose id is l2
# print(soup.select('li[id="l2"]'))


# hierarchy selectors
#  descendant selector
# li elements anywhere below a div
# print(soup.select('div li'))

# child selector
#  only the direct children of a tag
# note: some selector dialects are picky about the spaces around >, but bs4 accepts it either way and still returns results
# print(soup.select('div > ul > li'))


# every a tag and every li tag
# print(soup.select('a,li'))

# node information
#    getting a node's text
# obj = soup.select('#d1')[0]
# if the tag contains only text, string and get_text() both work
# if the tag contains other tags besides text, string returns nothing while get_text() still returns the text
# so get_text() is generally the recommended choice
# print(obj.string)
# print(obj.get_text())

# node attributes
# obj = soup.select('#p1')[0]
# name is the tag name
# print(obj.name)
# attrs returns the attribute values as a dict
# print(obj.attrs)

# three equivalent ways to read a node's attributes
obj = soup.select('#p1')[0]

print(obj.attrs.get('class'))
print(obj.get('class'))
print(obj['class'])

From the network

import urllib.request

url = 'https://www.starbucks.com.cn/menu/'

response = urllib.request.urlopen(url)

content = response.read().decode('utf-8')


from bs4 import BeautifulSoup

soup = BeautifulSoup(content,'lxml')

# //ul[@class="grid padded-3 product"]//strong/text()
name_list = soup.select('ul[class="grid padded-3 product"] strong')

for name in name_list:
    print(name.get_text())

selenium

Installation

Download the matching browser driver (chromedriver)

pip install selenium
from selenium import webdriver

path = "驱动路径"
browser = webdriver.Chrome(path)
url = "要访问的网址"
browser.get(url)


Locating elements

  • find_element_by_id --- button = browser.find_element_by_id('su')
  • find_element_by_name --- by the name attribute (use find_element_by_class_name for class names)
  • find_elements_by_xpath
  • find_elements_by_tag_name
  • find_elements_by_css_selector
  • find_elements_by_link_text ------- browser.find_element_by_link_text("新闻")
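
In Selenium 4 these find_element_by_* helpers were removed; locating goes through find_element / find_elements plus the By class. A minimal sketch of the equivalent calls (the 'su' id and the link text reuse the examples above):

from selenium.webdriver.common.by import By

button = browser.find_element(By.ID, 'su')
links = browser.find_elements(By.XPATH, '//a[@class="n"]')
news = browser.find_element(By.LINK_TEXT, '新闻')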

Element information

  • attribute: .get_attribute('class')
  • text: .text
  • tag name: .tag_name

Interaction

  • click: click()

  • type text: send_keys()

  • go back: browser.back()

  • go forward: browser.forward()

  • screenshot: browser.save_screenshot('baidu.png')

  • scroll:

    js='document.documentElement.scrollTop=100000'
    browser.execute_script(js) ## run js code

  • page source: page_source

  • quit: browser.quit()

Example: a real Baidu search (see the Examples section below)

Chrome headless

Headless (no-GUI) mode: PhantomJS was used in the early days; nowadays Chrome headless is used.

Configuration

from selenium import webdriver
# for specifying the driver path; in newer versions the path moved into the Service object
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options


# browser factory
def share_browser():
  # driver matching your Chrome version: http://chromedriver.storage.googleapis.com/index.html
  path = r'C:\Users\ext.liyuanhao3\Documents\worker_node\2023-6\chromedriver.exe'

  # options object
  options = Options()
  # options.add_experimental_option('detach', True) # keep the browser open after the script ends
  options.add_argument('--headless') # run without a visible window
  options.add_argument('--disable-gpu') # disable gpu acceleration

  # create the browser
  service = Service(path)
  browser = webdriver.Chrome(service=service, options=options)
  return browser

Usage

# create the browser
browser = share_browser()
# open the target url
browser.get('https://www.baidu.com')
# save a screenshot
browser.save_screenshot('baidu.png')
# quit
browser.quit()

requests

Largely the same as urllib but easier to use: no manual encoding and much simpler request customization.

Installation: pip install requests

Comparison with urllib

| urllib                           | requests        | description      |
| -------------------------------- | --------------- | ---------------- |
| http.client.HTTPResponse         | models.Response | response type    |
| read(), readline(), readlines()  | text, content   | get the body     |
| geturl()                         | url             | url              |
| getheaders()                     | headers         | response headers |
| getcode()                        | status_code     | status code      |
|                                  | encoding        | encoding         |
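
A quick look at the requests side of that table (a minimal sketch):

import requests

response = requests.get('http://www.baidu.com')
response.encoding = 'utf-8'    # tell requests which encoding to use for .text

print(response.url)            # final url
print(response.status_code)    # status code, e.g. 200
print(response.headers)        # response headers
print(response.text)           # body decoded to str
print(response.content[:20])   # raw body as bytes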

GET requests

  • parameters are passed with params
  • parameters do not need urlencode encoding
  • no request-object customization is needed
  • the ? in the resource path is optional
import requests

url = 'https://www.baidu.com/s'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

data = {
    'wd':'北京'
}

# url: the resource path
# params: the query parameters
# kwargs: extra keyword arguments (headers, proxies, ...)
response = requests.get(url=url,params=data,headers=headers)

content = response.text

print(content)

POST requests

  • the POST body does not need to be byte-encoded
  • the POST parameters are passed with data
  • no request-object customization is needed
import requests

url = 'https://fanyi.baidu.com/sug'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

data = {
    'kw': 'eye'
}

# url: the request address
# data: the form parameters
# kwargs: extra keyword arguments
response = requests.post(url=url,data=data,headers=headers)

content =response.text

import json

obj = json.loads(content)  # json.loads no longer accepts an encoding argument; response.json() works too
print(obj)

Proxies

import requests

url = 'http://www.baidu.com/s?'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
}

data = {
    'wd':'ip'
}


proxy = {
    'http':'212.129.251.55:16816'
}

response = requests.get(url = url,params=data,headers = headers,proxies = proxy)

content = response.text

with open('daili.html','w',encoding='utf-8') as fp:
    fp.write(content)

Example: login, hidden-field parameters and captcha (see the Examples section below)

scrapy

A web-scraping framework

Installation

 pip install scrapy
Error 1: building 'twisted.test.raiser' extension
        error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++
        Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools
Fix 1:
 http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
 Twisted‑20.3.0‑cp37‑cp37m‑win_amd64.whl
 cp37 must match your python version
 amd64 must match your operating system architecture
 after downloading, install it with: pip install <path to the twisted .whl>
 remember: install twisted first, then install scrapy again
Error 2: a prompt asking you to run python -m pip install --upgrade pip
Fix 2:   run python -m pip install --upgrade pip
Error 3: a win32 error
Fix 3:   pip install pypiwin32
Last resort: anaconda

Creating a project

 scrapy startproject test_scrapy
- project name
 - project name
  - spiders
   __init__.py
   your_spider.py   (the spider file you create)
  __init__.py
     items.py  defines the data structures
     middlewares.py middleware, usually used to configure proxies
     pipelines.py pipelines, used to process and persist the scraped items
     settings.py  configuration file
## create the spider file
cd test_scrapy/test_scrapy/spiders
scrapy genspider <spider name> <site domain>
import scrapy

class BookSpider(scrapy.Spider):
    name = "book"        ## 运行时使用的名字
    allowed_domains = ["www.douban.com"]  ## 允许的域名
    start_urls = ["https://www.douban.com"]  ## get请求的起始页

    def parse(self, response):     ## 解析数据的回调函数
        pass
  • response.text ---> the response body as a string
  • response.body ---> the response body as raw bytes
  • response.xpath() ---> returns a list of Selector objects
    • the return value can call xpath() again
  • extract() ---> extracts the data held by a Selector object
  • extract_first() ---> extracts the first item of the Selector list (see the sketch below)
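
A minimal sketch of how these fit together inside a spider's parse method (the xpath expressions are hypothetical):

    def parse(self, response):
        print(response.text[:100])               # body as a string
        # xpath() returns a list of Selector objects
        for li in response.xpath('//ul/li'):
            # a Selector can call xpath() again, relative to itself
            title = li.xpath('./a/text()').extract_first()
            print(title)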

Running

## run from the spiders directory
scrapy crawl <spider name>

How it works

  • the engine asks the spiders for urls
  • the engine hands the urls to the scheduler
  • the scheduler wraps each url in a request object and puts it into a queue
  • dequeued requests go through the engine to the downloader
  • the downloader sends the requests and fetches the data from the internet
  • the downloader returns the results to the engine
  • the engine hands the data to the spiders, which parse it (for example with xpath)
  • the parsed results go back to the engine: items are handed to the pipelines, new urls go back to the scheduler, and the cycle repeats

scrapy shell

An interactive console for debugging spider code directly

scrapy shell https://jpt.jd.com/admin/ad/plan-inner-branch
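
Inside the shell a response object for that url is already available, so selectors can be tried out interactively (a minimal sketch; the xpath is hypothetical):

# typed at the scrapy shell prompt
response.status
response.xpath('//title/text()').extract_first()
view(response)   # open the downloaded page in the local browser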

pipelines, items, settings

A function containing yield is a generator: like return it hands a value back to the caller, but it can be iterated, yielding one value per iteration instead of ending the function.
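
A tiny illustration of that behaviour (a minimal sketch, independent of scrapy):

def count_up_to(n):
    i = 1
    while i <= n:
        yield i   # hand back one value and pause here until the next iteration
        i += 1

for value in count_up_to(3):
    print(value)  # prints 1, 2, 3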

  • settings.py

    ROBOTSTXT_OBEY = True  ## when enabled, the spider respects robots.txt (the "gentlemen's agreement")
    
    ITEM_PIPELINES = {
       # there can be many pipelines; each has a priority from 1 to 1000, and smaller values run first
       'scrapy_dangdang_095.pipelines.ScrapyDangdang095Pipeline': 300,
    
    #    DangDangDownloadPipeline
       'scrapy_dangdang_095.pipelines.DangDangDownloadPipeline': 301
    }
    
  • items.py

    import scrapy
    
    
    class TestScrapyItem(scrapy.Item):
        # define the fields for your item here like:
        name = scrapy.Field()
        src = scrapy.Field()
        price = scrapy.Field()
    
  • pipelines.py

    from itemadapter import ItemAdapter
    
    # to use a pipeline it must be enabled in settings
    class TestScrapyPipeline:
    
        # runs once before the spider starts
        def open_spider(self,spider):
            self.fp = open('book.json','w',encoding='utf-8')
    
        # item is the book object yielded by the spider
        def process_item(self, item, spider):
            self.fp.write(str(item))
            return item
    
        # runs once after the spider finishes
        def close_spider(self,spider):
            self.fp.close()
    
    import urllib.request
    
    # enabling a second pipeline:
    #    (1) define the pipeline class
    #    (2) enable it in settings:
    # 'scrapy_dangdang_095.pipelines.DangDangDownloadPipeline': 301
    class DangDangDownloadPipeline:
        def process_item(self, item, spider):
            url = 'http:' + item.get('src')
            filename = './books/' + item.get('name') + '.jpg'
            urllib.request.urlretrieve(url = url, filename= filename)
            return item
    
  • book.py

import scrapy
from test_scrapy.items import TestScrapyItem



class BookSpider(scrapy.Spider):
    name = 'book'
    # for multi-page crawls allowed_domains must cover every page; usually only the bare domain is listed
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']


    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
#       pipelines: persist the data
#       items:     define the data structures
#       every Selector object can call xpath() again
        li_list = response.xpath('//ul[@id="component_59"]/li')

        for li in li_list:
            src = li.xpath('.//img/@data-original').extract_first()
            # the first image uses a different attribute from the rest:
            # the first image's src works directly, the others lazy-load the url in data-original
            if src:
                src = src
            else:
                src = li.xpath('.//img/@src').extract_first()

            name = li.xpath('.//img/@alt').extract_first()
            price = li.xpath('.//p[@class="price"]/span[1]/text()').extract_first()

            book = TestScrapyItem(src=src,name=name,price=price)

            # yield the item back to the caller (the engine), which hands it to the pipelines
            yield book

        if self.page < 100:
            self.page = self.page + 1

            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'

#             yield a request back to the caller, so the scheduler puts it in the queue
#             scrapy.Request is scrapy's GET request
#             url is the request address
#             callback is the function to run for the response; note it is passed without ()
            yield scrapy.Request(url=url,callback=self.parse)

Following links into detail pages

# (imports omitted: scrapy and this project's ScrapyMovie099Item from items.py)
class MvSpider(scrapy.Spider):
    name = 'mv'
    allowed_domains = ['www.dytt8.net']
    start_urls = ['https://www.dytt8.net/html/gndy/china/index.html']

    def parse(self, response):
#         we want the name from the list page and the image from the detail page
        a_list = response.xpath('//div[@class="co_content8"]//td[2]//a[2]')

        for a in a_list:
            # grab the name and the link to follow from the list page
            name = a.xpath('./text()').extract_first()
            href = a.xpath('./@href').extract_first()

            # the detail-page address is
            url = 'https://www.dytt8.net' + href


            # request the detail page, passing name along in meta
            yield  scrapy.Request(url=url,callback=self.parse_second,meta={'name':name})

    def parse_second(self,response):
        # if no data comes back, double-check that the xpath is correct
        src = response.xpath('//div[@id="Zoom"]//img/@src').extract_first()
        # read back the meta value attached to the request
        name = response.meta['name']

        movie = ScrapyMovie099Item(src=src,name=name)

        yield movie

CrawlSpider

Previously the follow-up links were built by hand; CrawlSpider can extract them automatically.

The LinkExtractor constructor

  • allow --- a regular expression
  • restrict_xpaths --- only extract links from the given xpath regions
  • restrict_css --- only extract links from the given css selectors
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy_readbook_101.items import ScrapyReadbook101Item


class ReadSpider(CrawlSpider):
    name = 'read'
    allowed_domains = ['www.dushu.com']
    start_urls = ['https://www.dushu.com/book/1188_1.html']

    rules = (
        Rule(LinkExtractor(allow=r'/book/1188_\d+.html'), ## regex that matched links must satisfy
                           callback='parse_item',  ## callback, given as the function name string
                           follow=True),    ## whether to keep following links found on the new pages
    )

    def parse_item(self, response):

        img_list = response.xpath('//div[@class="bookslist"]//img')

        for img in img_list:
            # alt holds the book name, data-original holds the lazily-loaded image url
            src = img.xpath('./@data-original').extract_first()
            name = img.xpath('./@alt').extract_first()

            book = ScrapyReadbook101Item(name=name,src=src)
            yield book

Saving to a database

  • install pymysql

  • settings.py

# note: write 'utf8' without the dash; 'utf-8' is not accepted here
DB_CHARSET = 'utf8'
DB_HOST = '192.168.231.130'
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = '1234'
DB_NAME = 'spider01'
  • pipelines.py
# load the settings file
from scrapy.utils.project import get_project_settings
import pymysql


class MysqlPipeline:

    def open_spider(self,spider):
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']

        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
                            host=self.host,
                            port=self.port,
                            user=self.user,
                            password=self.password,
                            db=self.name,
                            charset=self.charset
        )

        self.cursor = self.conn.cursor()


    def process_item(self, item, spider):

        sql = 'insert into book(name,src) values("{}","{}")'.format(item['name'],item['src'])
        # execute the sql statement
        self.cursor.execute(sql)
        # commit the transaction
        self.conn.commit()


        return item


    def close_spider(self,spider):
        self.cursor.close()
        self.conn.close()
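
Building the SQL with str.format breaks as soon as a name or src contains a quote character. A safer variant of process_item (a minimal sketch) lets pymysql bind the parameters itself:

    def process_item(self, item, spider):
        sql = 'insert into book(name, src) values(%s, %s)'
        # pymysql escapes the bound values, so quotes inside the data cannot break the statement
        self.cursor.execute(sql, (item['name'], item['src']))
        self.conn.commit()
        return item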

Logging

  • settings.py
    • LOG_FILE: write everything that would appear on screen to a file instead (nothing is shown on screen); the file name must end in .log
    • LOG_LEVEL: the minimum level of log messages to record
# LOG_LEVEL='WARNING'
LOG_FILE = 'logdemo.log'

POST requests

A POST needs a body, so simply listing the url in start_urls fetches nothing useful and parse never runs meaningfully; instead, override the start_requests method and send the request yourself.

  • url: the address to post to
  • headers: custom header information
  • callback: the callback function
  • formdata: the data carried by the post, as a dict
import scrapy


import json

class TestpostSpider(scrapy.Spider):
    name = 'testpost'
    allowed_domains = ['fanyi.baidu.com']
    
    def start_requests(self):
        url = 'https://fanyi.baidu.com/sug'

        data = {
            'kw': 'final'
        }

        yield scrapy.FormRequest(url=url,formdata=data,callback=self.parse_second)

    def parse_second(self,response):

        content = response.text
        obj = json.loads(content)  # the encoding argument is no longer accepted by json.loads

        print(obj)

Proxies

## in settings.py, enable this option
DOWNLOADER_MIDDLEWARES = {
 'postproject.middlewares.Proxy': 543,
}

## in middlewares.py, add this method to the downloader middleware class
def process_request(self, request, spider):
    request.meta['proxy'] = 'https://113.68.202.10:9999'
    return None

超级鹰 (Chaojiying) captcha-solving platform

#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password =  password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: 图片字节
        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id:报错题目的图片ID
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


if __name__ == '__main__':
 chaojiying = Chaojiying_Client('超级鹰用户名', '超级鹰用户名的密码', '96001') # user center >> software ID: generate one and replace 96001
 im = open('a.jpg', 'rb').read()             # replace a.jpg with the local image path; on Windows // may be needed
 print(chaojiying.PostPic(im, 1902))           # 1902 is the captcha type (see the price list); in version 3.4+ print needs parentheses

Examples

Baidu Translate detailed translation

import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

headers = {
    # 'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    # 'Accept-Language': 'zh-CN,zh;q=0.9',
    # 'Connection': 'keep-alive',
    # 'Content-Length': '135',
    # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'BIDUPSID=DAA8F9F0BD801A2929D96D69CF7EBF50; PSTM=1597202227; BAIDUID=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; __yjs_duid=1_c19765bd685fa6fa12c2853fc392f8db1618999058029; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; BDUSS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDUSS_BFESS=R2bEZvTjFCNHQxdUV-cTZ-MzZrSGxhbUYwSkRkUWk2SkxxS3E2M2lqaFRLUlJoRVFBQUFBJCQAAAAAAAAAAAEAAAA3e~BTveK-9sHLZGF5AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOc7GBTnOxgaW; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BAIDUID_BFESS=DAA8F9F0BD801A29B2813502000BF8E9:SL=0:NR=10:FG=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; PSINO=2; H_PS_PSSID=34435_31660_34405_34004_34073_34092_26350_34426_34323_22158_34390; delPer=1; BA_HECTOR=8185a12020018421b61gi6ka20q; BCLID=10943521300863382545; BDSFRCVID=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; BCLID_BFESS=10943521300863382545; BDSFRCVID_BFESS=boDOJexroG0YyvRHKn7hh7zlD_weG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu-EHtdogKK0mOTHv8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR3aQ5rtKRTffjrnhPF3-44vXP6-hnjy3bRkX4Q4Wpv_Mnndjn6SQh4Wbttf5q3RymJ42-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD-ug3-7qqU5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3-oJqC8hMIt43f; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1629701482,1629702031,1629702343,1629704515; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1629704515; __yjs_st=2_MDBkZDdkNzg4YzYyZGU2NTM5NzBjZmQ0OTZiMWRmZGUxM2QwYzkwZTc2NTZmMmIxNDJkYzk4NzU1ZDUzN2U3Yjc4ZTJmYjE1YTUzMTljYWFkMWUwYmVmZGEzNmZjN2FlY2M3NDAzOThhZTY5NzI0MjVkMmQ0NWU3MWE1YTJmNGE5NDBhYjVlOWY3MTFiMWNjYTVhYWI0YThlMDVjODBkNWU2NjMwMzY2MjFhZDNkMzVhNGMzMGZkMWY2NjU5YzkxMDk3NTEzODJiZWUyMjEyYTk5YzY4ODUyYzNjZTJjMGM5MzhhMWE5YjU3NTM3NWZiOWQxNmU3MDVkODExYzFjN183XzliY2RhYjgz; ab_sr=1.0.1_ZTc2ZDFkMTU5ZTM0ZTM4MWVlNDU2MGEzYTM4MzZiY2I2MDIxNzY1Nzc1OWZjZGNiZWRhYjU5ZjYwZmNjMTE2ZjIzNmQxMTdiMzIzYTgzZjVjMTY0ZjM1YjMwZTdjMjhiNDRmN2QzMjMwNWRhZmUxYTJjZjZhNTViMGM2ODFlYjE5YTlmMWRjZDAwZGFmMDY4ZTFlNGJiZjU5YzE1MGIxN2FiYTU3NDgzZmI4MDdhMDM5NTQ0MjQxNDBiNzdhMDdl',
    # 'Host': 'fanyi.baidu.com',
    # 'Origin': 'https://fanyi.baidu.com',
    # 'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
    # 'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    # 'sec-ch-ua-mobile': '?0',
    # 'Sec-Fetch-Dest': 'empty',
    # 'Sec-Fetch-Mode': 'cors',
    # 'Sec-Fetch-Site': 'same-origin',
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    # 'X-Requested-With': 'XMLHttpRequest',
}

data = {
    'from': 'en',
    'to': 'zh',
    'query': 'love',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '198772.518981',
    'token': '5483bfa652979b41f9c90d91f3de875d',
    'domain': 'common',
}

data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url = url,data = data,headers = headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

import json
from pprint import pprint
obj = json.loads(content)
pprint(obj)

Douban movies

import urllib.parse
import urllib.request

def create_request(page):
    base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'

    data = {
        'start':(page - 1) * 20,
        'limit':20
    }

    data = urllib.parse.urlencode(data)

    url = base_url + data

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
    }

    request = urllib.request.Request(url=url,headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page,content):
    with open('douban_' + str(page) + '.json','w',encoding='utf-8')as fp:
        fp.write(content)


# entry point
if __name__ == '__main__':
    start_page = int(input('enter the start page: '))
    end_page = int(input('enter the end page: '))

    for page in range(start_page,end_page+1):
#         each page gets its own customized request object
        request = create_request(page)
#         fetch the response data
        content = get_content(request)
#         save it
        down_load(page,content)

站长素材 (sc.chinaz.com) image download


# (1) build the request object
# (2) fetch the page source
# (3) download


# goal: download the images from the first ten pages
# page 1: https://sc.chinaz.com/tupian/qinglvtupian.html
# page n: https://sc.chinaz.com/tupian/qinglvtupian_<page>.html

import urllib.request
from lxml import etree

def create_request(page):
    if(page == 1):
        url = 'https://sc.chinaz.com/tupian/qinglvtupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/qinglvtupian_' + str(page) + '.html'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    }

    request = urllib.request.Request(url = url, headers = headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content


def down_load(content):
#     download the images
    # urllib.request.urlretrieve('<image url>', '<file name>')
    tree = etree.HTML(content)

    name_list = tree.xpath('//div[@id="container"]//a/img/@alt')

    # image sites usually lazy-load, so the real address sits in src2 rather than src
    src_list = tree.xpath('//div[@id="container"]//a/img/@src2')

    for i in range(len(name_list)):
        name = name_list[i]
        src = src_list[i]
        url = 'https:' + src

        urllib.request.urlretrieve(url=url,filename='./loveImg/' + name + '.jpg')




if __name__ == '__main__':
    start_page = int(input('enter the start page: '))
    end_page = int(input('enter the end page: '))

    for page in range(start_page,end_page+1):
        # (1) build the request object
        request = create_request(page)
        # (2) fetch the page source
        content = get_content(request)
        # (3) download
        down_load(content)

Baidu search


from selenium import webdriver

# create the browser object
path = 'chromedriver.exe'
browser = webdriver.Chrome(path)

# url
url = 'https://www.baidu.com'
browser.get(url)

import time
time.sleep(2)

# get the search box
input = browser.find_element_by_id('kw')

# type 周杰伦 into the search box
input.send_keys('周杰伦')

time.sleep(2)

# get the "百度一下" (search) button
button = browser.find_element_by_id('su')

# click the button
button.click()

time.sleep(2)

# scroll to the bottom
js_bottom = 'document.documentElement.scrollTop=100000'
browser.execute_script(js_bottom)

time.sleep(2)

# get the "next page" button
next = browser.find_element_by_xpath('//a[@class="n"]')

# click next page
next.click()

time.sleep(2)

# go back to the previous page
browser.back()

time.sleep(2)

# go forward again
browser.forward()

time.sleep(3)

# quit
browser.quit()

Login, hidden-field parameters and captcha

import requests


# url of the login page
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}

# fetch the page source
response = requests.get(url = url,headers = headers)
content = response.text

# parse the page source to extract __VIEWSTATE and __VIEWSTATEGENERATOR
from bs4 import BeautifulSoup

soup = BeautifulSoup(content,'lxml')

# get __VIEWSTATE
viewstate = soup.select('#__VIEWSTATE')[0].attrs.get('value')

# get __VIEWSTATEGENERATOR
viewstategenerator = soup.select('#__VIEWSTATEGENERATOR')[0].attrs.get('value')


# get the url of the captcha image
code = soup.select('#imgCode')[0].attrs.get('src')
code_url = 'https://so.gushiwen.cn' + code

# pitfall: downloading the captcha with urlretrieve opens a new connection, so it would not match the login session
# import urllib.request
# urllib.request.urlretrieve(url=code_url,filename='code.jpg')
# requests provides session(); through its return value all requests share one session (and its cookies)

session = requests.session()
# fetch the captcha image within the same session
response_code = session.get(code_url)
# use the binary content because this is an image download
content_code = response_code.content
# 'wb' mode writes binary data to the file
with open('code.jpg','wb')as fp:
    fp.write(content_code)


# after downloading the captcha image locally, look at it and type its value into the console;
# that value is then passed as the code parameter so the login can succeed

code_name = input('enter the captcha: ')


# submit the login form
url_post = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'

data_post = {
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': '595165358@qq.com',
    'pwd': 'action',
    'code': code_name,
    'denglu': '登录',
}

response_post = session.post(url = url_post, headers = headers, data = data_post)

content_post = response_post.text

with open('gushiwen.html','w',encoding='utf-8') as fp:
    fp.write(content_post)