Building a Twitter Crawler

Method 1: Create your own app

1. Create an app

Go to https://apps.twitter.com/ and create your own app; only with an app can you access the Twitter API and scrape data. The simplest app is enough: fill in the fields however you like, since no further verification is required. All we want are the app's Consumer Key (API Key), Consumer Secret (API Secret), Access Token, and Access Token Secret. Because each app has its own rate limit, you can register several apps to raise the total number of requests.
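Since the limits are per app, one simple pattern is to keep the credential sets in a list and cycle through them. A minimal sketch (all key values below are placeholders for your own apps' keys):

from itertools import cycle

# Placeholder credentials for several registered apps
APP_KEYS = cycle([
    {"consumer_key": "KEY1", "consumer_secret": "SECRET1",
     "access_token": "TOKEN1", "access_token_secret": "TOKEN_SECRET1"},
    {"consumer_key": "KEY2", "consumer_secret": "SECRET2",
     "access_token": "TOKEN2", "access_token_secret": "TOKEN_SECRET2"},
])

def next_credentials():
    # Hand back the next app's keys, wrapping around indefinitely
    return next(APP_KEYS)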

2. Decide which APIs to use

Twitter offers several kinds of APIs; the most commonly used are the REST API and the Streaming API. The former is the familiar request/response style, while the latter can track and monitor a user or a topic in real time.
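For example, tracking a topic with the Streaming API can be done through the third-party tweepy library. This is only a sketch, not part of this post's script, and it assumes a recent tweepy version; the keys are the app credentials from step 1:

import tweepy

class TopicStream(tweepy.Stream):
    # Called once for every tweet matching the tracked keywords
    def on_status(self, status):
        print(status.user.screen_name, status.text)

stream = TopicStream("CONSUMER_KEY", "CONSUMER_SECRET",
                     "ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
stream.filter(track=["some topic"])  # blocks and streams matching tweets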

The REST API contains many endpoints; the ones worth crawling include the following (a usage sketch follows the list):

GET statuses/user_timeline: returns the tweets posted by a user. Note that on Twitter, replies also count as tweets.
GET friends/ids: returns the accounts a user follows (followees).
GET followers/ids: returns a user's followers.
GET users/show: returns a user's profile information.
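These endpoints can be called from any REST client; as one illustration, here is a sketch using tweepy's wrappers for the endpoints above (assuming a recent tweepy version, with placeholder keys and "jack" as an example account):

import tweepy

auth = tweepy.OAuth1UserHandler("CONSUMER_KEY", "CONSUMER_SECRET",
                                "ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth, wait_on_rate_limit=True)  # sleep through rate limits

tweets = api.user_timeline(screen_name="jack", count=20)  # statuses/user_timeline
followees = api.get_friend_ids(screen_name="jack")        # friends/ids
followers = api.get_follower_ids(screen_name="jack")      # followers/ids
profile = api.get_user(screen_name="jack")                # users/show
print(len(tweets), profile.followers_count)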

Below is the complete crawler script from this post. Note that it does not go through the app API above: it fetches Twitter's web timeline JSON endpoints directly, authenticated with browser cookies.

# -*- coding: utf-8 -*-
# @Time : 2020/10/21 16:43
# @Author : yuchuan
# @File : crawl_tweets_by_id.py
# @Software : PyCharm
import requests
import json
import urllib.parse
import pandas as pd
from datetime import datetime, timedelta
import time

# getuserID resolves a user ID from a screen name, so IDs need not be looked up manually
def main():
    # Build request headers that impersonate a logged-in browser.
    # Fields shown as ******************* must be replaced with your own values.
    headers = {
        "cookie": 'guest_id=v1:164775775099786971; d_prefs=MToxLGNvbnNlbnRfdmVyc2lvbjoyLHRleHRfdmVyc2lvbjoxMDAw; guest_id_ads=v1:164775775099786971; guest_id_marketing=v1:164775775099786971; personalization_id="v1_Gxv55pYehjUUIqIzDWKURw=="; _ga=GA1.2.1337685308.1647757781; _gid=GA1.2.1172757769.1647757781; kdt=p7UZYMTikXsQrnReETEWpS60OvEm9FE2jnoM1XA4; auth_token=46c7ede4ccb9ecf77a4d15b07884d903293215d0; ct0=3377c7c5232fe924d9e29fcec79a9f94353ebb29dd5504b7dbd9cddff6d3402fd19a48de16bd04fd830a5fdbff90420a9d37f27f768cbda3a081b4eff5f729448435b85fc608d4abd083dcc7d3de3809; twid=u=1499860202291531782; att=1-tG82KoUAicixFemkiupEht8SE0BroiRsYUYySREd; des_opt_in=Y; external_referer=padhuUp37zixoA2Yz6IlsoQTSjz5FgRcKMoWWYN3PEQ=|0|8e8t2xd8A2w=',
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36",
        # x-csrf-token must equal the ct0 value inside the cookie above
        "x-csrf-token": "3377c7c5232fe924d9e29fcec79a9f94353ebb29dd5504b7dbd9cddff6d3402fd19a48de16bd04fd830a5fdbff90420a9d37f27f768cbda3a081b4eff5f729448435b85fc608d4abd083dcc7d3de3809",
        "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
    }
    # User IDs could also be supplied directly instead of screen names
    username = pd.read_csv('*******************.csv')
    for i in range(0, len(username['username'])):
        print(str(username['username'][i]))
        csv_username = str(username['username'][i])
        userID = getuserID(csv_username, headers)
        twitterURL = "https://twitter.com/i/api/2/timeline/profile/" + userID + ".json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&include_tweet_replies=false&count=20&userId=" + userID + "&ext=mediaStats%2ChighlightedLabel"
        flag = True
        full_content = []
        # flag records whether the current page still contains tweets from
        # today; flag = False stops the crawl
        while flag:
            # Fetch one page of the timeline
            response = connect(headers, twitterURL)
            # formatRes converts the response bytes into a dict
            responseJson = formatRes(response.content)
            # Parse every tweet in this JSON page: text, time, retweets, likes, etc.
            content = parsetweets(responseJson)
            # Accumulate each page's tweets into one list
            full_content.extend(content)
            # Build the URL of the next JSON page of tweets
            twitterURL = getNewURL(responseJson, userID)
            flag = CtrlFlag(content)
        print("------------------------------------------------------------------------------------------------\n------------------------------------------------------------------------------------------------")
        # Keep only today's tweets
        everydaytweet = todaytweet(full_content)
        # Save to CSV, one file per user
        saveData(everydaytweet, csv_username)
        time.sleep(30)

# Keep only today's tweets. The original idea was to sort inside the CSV and
# slice out today's rows, but sorting by time there did not work; filtering
# today's tweets from the list before saving to CSV does.
def CtrlFlag(content):
    flag = True
    # Local (Hong Kong) time minus 8 hours gives UTC, matching created_at
    today_str = (todaytime() + timedelta(hours=-8)).strftime("%Y-%m-%d")
    count = 0
    for i in range(0, len(content)):
        if content[i][0][0:10] not in today_str:
            count = count + 1
    if count == len(content):
        flag = False
    return flag

def getuserID(username, headers):
    # The GraphQL UserByScreenName endpoint maps a screen name to a user object
    connectURL = "https://twitter.com/i/api/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22" + username + "%22%2C%22withHighlightedLabel%22%3Atrue%7D"
    print(connectURL)
    response = connect(headers, connectURL)
    responseJson = formatRes(response.content)
    data = responseJson['data']['user']
    # rest_id is the numeric user ID
    userID = find('rest_id', data)
    return userID

def todaytweet(full_content):
    content = []
    # todaytime() is Hong Kong time; subtracting 8 hours gives UTC, which
    # matches the crawled created_at timestamps
    today_str = (todaytime() + timedelta(hours=-8)).strftime("%Y-%m-%d")
    for i in range(0, len(full_content)):
        if full_content[i][0][0:10] in today_str:
            content.append(full_content[i])
    return content

# Parse the tweets, timestamps, and counts from one JSON page, reformatting
# the timestamps as YYYY-MM-DD HH:MM:SS
def parsetweets(responseJson):
    tweets = responseJson['globalObjects']['tweets']
    full_text = findAll('full_text', tweets)
    created_at = findAll('created_at', tweets)
    favorite_count = findAll('favorite_count', tweets)
    quote_count = findAll('quote_count', tweets)
    reply_count = findAll('reply_count', tweets)
    retweet_count = findAll('retweet_count', tweets)
    time1 = []
    time2 = []
    for i in range(0, len(created_at)):
        # created_at is already UTC, so everything is kept in UTC: local
        # (Hong Kong) time is UTC+8, US Eastern time is UTC-5
        time1.append(datetime.strptime(created_at[i], "%a %b %d %H:%M:%S +0000 %Y"))
        time2.append(datetime.strftime(time1[i], '%Y-%m-%d %H:%M:%S'))  # datetime -> str
    tweetData = []
    for i in range(0, len(full_text)):
        tweetData.append([time2[i], full_text[i], favorite_count[i], quote_count[i], reply_count[i], retweet_count[i]])
    return tweetData

# Current date as a datetime (formatted later as e.g. 20201029); callers
# convert to str as needed
def todaytime():
    today = datetime.today()
    return today

# Save the tweets to CSV, one file per user
def saveData(content, filename):
    filetime = todaytime().strftime('%Y%m%d')
    filename = filetime + " " + filename
    filepath = 'D:/twitterdata/' + filename + '.csv'
    name = ['Time', 'Tweet', 'Favorite', 'Quote', 'Reply', 'Retweet']
    Data = pd.DataFrame(columns=name, data=content)
    Data.to_csv(filepath, encoding='utf-8-sig')

# Search nested dict data for a key and return its value
def find(target, dictData, notFound='not found'):
    queue = [dictData]
    while len(queue) > 0:
        data = queue.pop()
        for key, value in data.items():
            if key == target:
                return value
            elif type(value) == dict:
                queue.append(value)
    return notFound

# Collect the value of target from every tweet entry in a dict of dicts
def findAll(target, dictData):
    result = []
    for key, values in dictData.items():
        result.append(values[target])
    return result

# Extract the pagination cursor and build the URL of the next page
def getNewURL(responseJson, userID):
    # The last element of the entries list is the dict that carries the cursor
    responseJsonCursor1 = responseJson['timeline']['instructions'][0]['addEntries']['entries'][-1]
    cursorASCII = find('cursor', responseJsonCursor1)
    cursorASCII2 = find('value', cursorASCII)
    cursor = urllib.parse.quote(cursorASCII2)
    newURL = "https://twitter.com/i/api/2/timeline/profile/" + userID + ".json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&include_tweet_replies=false&count=20&cursor=" + cursor + "&userId=" + userID + "&ext=mediaStats%2ChighlightedLabel"
    return newURL

# Format the fetched JSON: bytes -> str, then json.loads() parses the str into a dict
def formatRes(res):
    strRes = str(res, 'utf-8')
    dictRes = json.loads(strRes)
    return dictRes

# Fetch a URL through a local proxy; adjust the proxies to your own setup
def connect(headers, twitterURL):
    proxies = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
    response = requests.get(twitterURL, headers=headers, proxies=proxies)
    return response

if __name__ == "__main__":  # entry point
    main()
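For reference, the input CSV read by main() is assumed to hold a single username column of screen names (the real filename is redacted above). A quick way to produce such a file, with usernames.csv as an illustrative name and two example accounts:

import pandas as pd

# Replace the example screen names with the accounts you want to crawl
pd.DataFrame({"username": ["jack", "TwitterDev"]}).to_csv("usernames.csv", index=False)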

