Crawlers, Part 1
Published: 2019-06-09


A First Crawler

#! /usr/bin/env python
# encoding: utf-8

from bs4 import BeautifulSoup
import requests


response = requests.get("http://www.autohome.com.cn/news/")
# response.text
response.encoding = response.apparent_encoding  # fix garbled characters in the response

soup = BeautifulSoup(response.text, features="html.parser")  # build the Soup object
soup_obj = soup.find(id="auto-channel-lazyload-article")  # find returns the first matching element

li_list = soup_obj.find_all("li")  # find_all returns every match, as a list
# print(target)
for i in li_list:
    a = i.find("a")
    if a:
        a_attrs = a.attrs.get("href")  # attrs reads the tag's attributes
        print(a_attrs)
        a_h = a.find("h3")
        print(a_h)
        img = a.find("img")
        print(img)

requests

The Python standard library ships urllib, urllib2, httplib and similar modules for making HTTP requests, but their APIs are painful to use. They were built for another era and another internet, and even the simplest task takes a huge amount of work, up to and including overriding various methods.

Requests is an Apache2-licensed HTTP library written in Python. It is a high-level wrapper over the built-in modules, which makes issuing network requests from Python far more pleasant; with Requests you can easily do anything a browser can do.

 

requests response attributes

response = requests.get('URL')
response.text                   # response body as text
response.content                # response body as bytes
response.encoding               # the encoding used to decode the response (can be set)
response.apparent_encoding      # encoding detected from the page itself
response.status_code            # HTTP status code
response.cookies.get_dict()     # cookies as a dict
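To see these attributes on a real response, here is a minimal sketch; it assumes network access and uses httpbin.org (which also appears in the examples further down) purely as a test server:

import requests

response = requests.get('http://httpbin.org/get')
response.encoding = response.apparent_encoding   # decode with the encoding detected from the page

print(response.status_code)            # e.g. 200
print(response.encoding)               # encoding now used to build response.text
print(response.text[:200])             # first part of the body as text
print(response.content[:200])          # first part of the body as bytes
print(response.cookies.get_dict())     # cookies returned by the server, as a dict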
- Method helpers
    requests.get(.....)
    requests.post(.....)
    requests.put(.....)
    requests.delete(.....)
    ...

    requests.request('POST'...)

- Parameters of requests.request
    - method:  HTTP method
    - url:     target URL
    - params:  parameters passed in the URL (GET)
        requests.request(
            method='GET',
            url='http://www.oldboyedu.com',
            params={'k1': 'v1', 'k2': 'v2'}
        )
        # http://www.oldboyedu.com?k1=v1&k2=v2

    - data: data passed in the request body

        requests.request(
            method='POST',
            url='http://www.oldboyedu.com',
            params={'k1': 'v1', 'k2': 'v2'},
            data={'use': 'alex', 'pwd': '123', 'x': [11, 2, 3]}
        )

        Request header:
            content-type: application/url-form-encod.....
        Request body:
            use=alex&pwd=123

    - json: data passed in the request body, serialized as JSON

        requests.request(
            method='POST',
            url='http://www.oldboyedu.com',
            params={'k1': 'v1', 'k2': 'v2'},
            json={'use': 'alex', 'pwd': '123'}
        )

        Request header:
            content-type: application/json
        Request body:
            "{'use':'alex','pwd': '123'}"

        PS: use json when the data contains nested dictionaries

    - headers: request headers

        requests.request(
            method='POST',
            url='http://www.oldboyedu.com',
            params={'k1': 'v1', 'k2': 'v2'},
            json={'use': 'alex', 'pwd': '123'},
            headers={
                'Referer': 'http://dig.chouti.com/',
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
            }
        )

    - cookies: cookies

    - files: file upload

    - auth: basic authentication (adds the encoded username and password to the headers)

    - timeout: request/response timeout

    - allow_redirects: whether to follow redirects

    - proxies: proxies

    - verify: whether to verify the certificate

    - cert: certificate file

    - stream: download the response iteratively as a stream

    - session: keeps the client's access history (cookies) across requests
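To see where each of these parameters ends up, the sketch below (an assumption-based example: it posts to httpbin.org/post, which simply echoes the request back) sends the same payload once as form data and once as JSON:

import requests

common = {'use': 'alex', 'pwd': '123'}

# data= -> the body is form-encoded
r1 = requests.request(method='POST',
                      url='http://httpbin.org/post',
                      params={'k1': 'v1'},   # ends up in the URL: ...?k1=v1
                      data=common)
print(r1.json()['form'])    # {'use': 'alex', 'pwd': '123'}

# json= -> the body is a JSON string, Content-Type: application/json
r2 = requests.request(method='POST',
                      url='http://httpbin.org/post',
                      params={'k1': 'v1'},
                      json=common,
                      headers={'User-Agent': 'my-crawler/0.1'})
print(r2.json()['json'])    # {'use': 'alex', 'pwd': '123'}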

Examples

1. GET requests

# 1. GET without parameters

import requests

ret = requests.get('https://github.com/timeline.json')

print(ret.url)
print(ret.text)


# 2. GET with parameters

import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.get("http://httpbin.org/get", params=payload)

print(ret.url)
print(ret.text)

 

2. POST requests

# 1. Basic POST

import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)

print(ret.text)


# 2. POST with custom headers and a JSON body

import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)

print(ret.text)
print(ret.cookies)

3. Other request methods

requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)  # POST is used to create new data
requests.put(url, data=None, **kwargs)   # PUT is used to replace existing data
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)  # PATCH complements PUT and updates part of a resource
requests.options(url, **kwargs)  # OPTIONS asks the server which methods you may use, e.g. when you do not know whether an endpoint accepts POST

# All of the above are built on top of:
requests.request(method, url, **kwargs)
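Following the note above about OPTIONS, a short sketch (assuming httpbin.org as a test server) reads the Allow header to see which methods an endpoint accepts:

import requests

ret = requests.options('http://httpbin.org/get')
print(ret.status_code)            # 200
print(ret.headers.get('Allow'))   # e.g. 'HEAD, OPTIONS, GET'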
import requests


def param_method_url():
    # requests.request(method='get', url='http://127.0.0.1:8000/test/')
    # requests.request(method='post', url='http://127.0.0.1:8000/test/')
    pass


def param_param():
    # - can be a dict
    # - can be a string
    # - can be bytes (ASCII only)

    # requests.request(method='get',
    #                  url='http://127.0.0.1:8000/test/',
    #                  params={'k1': 'v1', 'k2': '水电费'})

    # requests.request(method='get',
    #                  url='http://127.0.0.1:8000/test/',
    #                  params="k1=v1&k2=水电费&k3=v3&k3=vv3")

    # requests.request(method='get',
    #                  url='http://127.0.0.1:8000/test/',
    #                  params=bytes("k1=v1&k2=k2&k3=v3&k3=vv3", encoding='utf8'))

    # Error: non-ASCII characters cannot be passed as bytes
    # requests.request(method='get',
    #                  url='http://127.0.0.1:8000/test/',
    #                  params=bytes("k1=v1&k2=水电费&k3=v3&k3=vv3", encoding='utf8'))
    pass


def param_data():
    # can be a dict
    # can be a string
    # can be bytes
    # can be a file object

    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  data={'k1': 'v1', 'k2': '水电费'})

    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  data="k1=v1; k2=v2; k3=v3; k3=v4"
    #                  )

    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  data="k1=v1;k2=v2;k3=v3;k3=v4",
    #                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
    #                  )

    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  data=open('data_file.py', mode='r', encoding='utf-8'),  # file content: k1=v1;k2=v2;k3=v3;k3=v4
    #                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
    #                  )
    pass


def param_json():
    # the data is serialized into a string with json.dumps(...)
    # and sent in the request body with Content-Type: application/json
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     json={'k1': 'v1', 'k2': '水电费'})


def param_headers():
    # send request headers to the server
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     json={'k1': 'v1', 'k2': '水电费'},
                     headers={'Content-Type': 'application/x-www-form-urlencoded'}
                     )


def param_cookies():
    # send cookies to the server
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     data={'k1': 'v1', 'k2': 'v2'},
                     cookies={'cook1': 'value1'},
                     )
    # a CookieJar can also be used (the dict form is a wrapper around it)
    from http.cookiejar import CookieJar
    from http.cookiejar import Cookie

    obj = CookieJar()
    obj.set_cookie(Cookie(version=0, name='c1', value='v1', port=None, domain='', path='/', secure=False, expires=None,
                          discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False,
                          port_specified=False, domain_specified=False, domain_initial_dot=False, path_specified=False)
                   )
    requests.request(method='POST',
                     url='http://127.0.0.1:8000/test/',
                     data={'k1': 'v1', 'k2': 'v2'},
                     cookies=obj)


def param_files():
    # upload a file
    # file_dict = {
    #     'f1': open('readme', 'rb')
    # }
    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  files=file_dict)

    # upload a file with a custom file name
    # file_dict = {
    #     'f1': ('test.txt', open('readme', 'rb'))
    # }
    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  files=file_dict)

    # upload literal content with a custom file name
    # file_dict = {
    #     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
    # }
    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  files=file_dict)

    # upload literal content with a custom file name, content type and extra headers
    # file_dict = {
    #     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
    # }
    # requests.request(method='POST',
    #                  url='http://127.0.0.1:8000/test/',
    #                  files=file_dict)

    pass


def param_auth():
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
    print(ret.text)

    # ret = requests.get('http://192.168.1.1',
    #                    auth=HTTPBasicAuth('admin', 'admin'))
    # ret.encoding = 'gbk'
    # print(ret.text)

    # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
    # print(ret)


def param_timeout():
    # ret = requests.get('http://google.com/', timeout=1)
    # print(ret)

    # ret = requests.get('http://google.com/', timeout=(5, 1))
    # print(ret)
    pass


def param_allow_redirects():
    ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
    print(ret.text)


def param_proxies():
    # proxies = {
    #     "http": "61.172.249.96:80",
    #     "https": "http://61.185.219.126:3128",
    # }

    # proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}

    # ret = requests.get("http://www.proxy360.cn/Proxy", proxies=proxies)
    # print(ret.headers)

    # from requests.auth import HTTPProxyAuth
    #
    # proxyDict = {
    #     'http': '77.75.105.165',
    #     'https': '77.75.105.165'
    # }
    # auth = HTTPProxyAuth('username', 'mypassword')
    #
    # r = requests.get("http://www.google.com", proxies=proxyDict, auth=auth)
    # print(r.text)

    pass


def param_stream():
    ret = requests.get('http://127.0.0.1:8000/test/', stream=True)
    print(ret.content)
    ret.close()

    # from contextlib import closing
    # with closing(requests.get('http://httpbin.org/get', stream=True)) as r:
    #     # process the response here
    #     for i in r.iter_content():
    #         print(i)


def requests_session():
    import requests

    session = requests.Session()

    # 1. Visit any page first to obtain a cookie
    i1 = session.get(url="http://dig.chouti.com/help/service")

    # 2. Log in, carrying the previous cookie; the server authorizes the gpsd value inside it
    i2 = session.post(
        url="http://dig.chouti.com/login",
        data={
            'phone': "8615131255089",
            'password': "xxxxxx",
            'oneMonth': ""
        }
    )

    i3 = session.post(
        url="http://dig.chouti.com/link/vote?linksId=8589623",
    )
    print(i3.text)

Parameter examples
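The chouti.com login above depends on that specific site. As a self-contained sketch of the same idea (it assumes httpbin.org is reachable), the snippet below shows that a Session carries cookies from one request to the next automatically:

import requests

session = requests.Session()

# first request: the server sets a cookie
session.get('http://httpbin.org/cookies/set/gpsd/abc123')

# second request: the Session sends the stored cookie back without any extra code
ret = session.get('http://httpbin.org/cookies')
print(ret.text)   # {"cookies": {"gpsd": "abc123"}}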

 

BeautifulSoup

BeautifulSoup is a module that takes an HTML or XML string, parses it into a structured document, and then lets you use its methods to quickly locate specific elements, which makes searching HTML or XML much simpler.

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")
# find the first a tag
tag1 = soup.find(name='a')
# find all a tags
tag2 = soup.find_all(name='a')
# find the tag with id=link2
tag3 = soup.select('#link2')

Install:

pip3 install beautifulsoup4
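The examples in this section pass features="lxml", which relies on the third-party lxml parser. If it is not installed in your environment, install it as well, or fall back to the built-in "html.parser":

pip3 install lxml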

Usage example:

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
    ...
</body>
</html>
"""

soup = BeautifulSoup(html_doc, features="lxml")

1. name, the tag name

# tag = soup.find('a')
# name = tag.name    # get
# print(name)
# tag.name = 'span'  # set
# print(soup)

2. attrs, the tag's attributes

# tag = soup.find('a')
# attrs = tag.attrs           # get
# print(attrs)
# tag.attrs = {'ik': 123}     # set
# tag.attrs['id'] = 'iiiii'   # set
# print(soup)

3. children, all direct child tags

# body = soup.find('body')
# v = body.children

4. descendants, all descendant tags

# body = soup.find('body')
# v = body.descendants

5. clear, removes everything inside the tag (the tag itself is kept)

# tag = soup.find('body')
# tag.clear()
# print(soup)

6. decompose, recursively removes the tag and everything inside it

# body = soup.find('body')
# body.decompose()
# print(soup)

7. extract, like decompose but returns the removed tag

# body = soup.find('body')
# v = body.extract()
# print(soup)

8. decode, converts to a string (including the current tag); decode_contents (excluding the current tag)

# body = soup.find('body')
# v = body.decode()
# v = body.decode_contents()
# print(v)

9. encode, converts to bytes (including the current tag); encode_contents (excluding the current tag)

# body = soup.find('body')
# v = body.encode()
# v = body.encode_contents()
# print(v)

10. find, returns the first matching tag

# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)

11. find_all, returns all matching tags

# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a', limit=1)
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)


# ####### lists #######
# v = soup.find_all(name=['a', 'div'])
# print(v)

# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)

# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))

# v = soup.find_all(id=['link1', 'link2'])
# print(v)

# v = soup.find_all(href=['link1', 'link2'])
# print(v)

# ####### regular expressions #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)

# ####### filter by function #######
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)


# ## get, read a tag attribute
# tag = soup.find('a')
# v = tag.get('id')
# print(v)

12. has_attr, checks whether the tag has a given attribute

# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)

13. get_text, returns the text inside the tag

# tag = soup.find('a')
# v = tag.get_text()
# print(v)

14. index, returns the position of a child tag inside its parent

# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)

# tag = soup.find('body')
# for i, v in enumerate(tag):
#     print(i, v)

15. is_empty_element, whether the tag is an empty or self-closing element,

     i.e. one of: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'

# tag = soup.find('br')
# v = tag.is_empty_element
# print(v)

16. Tags related to the current tag

# soup.next
# soup.next_element
# soup.next_elements
# soup.next_sibling
# soup.next_siblings

#
# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings

#
# tag.parent
# tag.parents

17. Searching among a tag's related tags

# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)

# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)

# tag.find_parent(...)
# tag.find_parents(...)

# these take the same arguments as find_all
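Putting a few of the methods above together, here is a short sketch run against the html_doc sample defined earlier (so it assumes that variable and the lxml parser are available):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, features="lxml")

tag = soup.find(name='a', attrs={'class': 'sister'})   # first <a class="sister">
print(tag.attrs.get('id'))         # its id attribute
print(tag.get_text())              # its inner text
print(tag.find_next_sibling('a'))  # the next <a> tag at the same level

for a in soup.find_all('a'):
    print(a.get('href'))           # href of every <a> tag (None if absent)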

 

Reposted from: https://www.cnblogs.com/YingLai/p/6836823.html
