Building a Twitter Crawler

Method 1: Create your own app

1. Create an app

Go to https://apps.twitter.com/ and create your own app; only with an app can you access the Twitter API and scrape data. The simplest app is enough: fill in the fields however you like, since no further verification is required. All we want are the app's Consumer Key (API Key), Consumer Secret (API Secret), Access Token, and Access Token Secret. Because each app has its own rate limit, you can register several apps to raise the total number of requests.
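Since the limits are per app, one simple pattern is to keep the credential sets in a list and cycle through them. A minimal sketch (all key values below are placeholders for your own apps' keys):

from itertools import cycle

# Placeholder credentials for several registered apps
APP_KEYS = cycle([
    {"consumer_key": "KEY1", "consumer_secret": "SECRET1",
     "access_token": "TOKEN1", "access_token_secret": "TOKEN_SECRET1"},
    {"consumer_key": "KEY2", "consumer_secret": "SECRET2",
     "access_token": "TOKEN2", "access_token_secret": "TOKEN_SECRET2"},
])

def next_credentials():
    # Hand back the next app's keys, wrapping around indefinitely
    return next(APP_KEYS)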

2. Decide which APIs to use

Twitter offers several kinds of APIs; the most commonly used are the REST API and the Streaming API. The former is the familiar request/response style, while the latter can track and monitor a user or a topic in real time.
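For example, tracking a topic with the Streaming API can be done through the third-party tweepy library. This is only a sketch, not part of this post's script, and it assumes a recent tweepy version; the keys are the app credentials from step 1:

import tweepy

class TopicStream(tweepy.Stream):
    # Called once for every tweet matching the tracked keywords
    def on_status(self, status):
        print(status.user.screen_name, status.text)

stream = TopicStream("CONSUMER_KEY", "CONSUMER_SECRET",
                     "ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
stream.filter(track=["some topic"])  # blocks and streams matching tweets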

The REST API contains many endpoints; the ones worth crawling include the following (a usage sketch follows the list):

GET statuses/user_timeline: returns the tweets posted by a user. Note that on Twitter, replies also count as tweets.
GET friends/ids: returns the accounts a user follows (followees).
GET followers/ids: returns a user's followers.
GET users/show: returns a user's profile information.
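These endpoints can be called from any REST client; as one illustration, here is a sketch using tweepy's wrappers for the endpoints above (assuming a recent tweepy version, with placeholder keys and "jack" as an example account):

import tweepy

auth = tweepy.OAuth1UserHandler("CONSUMER_KEY", "CONSUMER_SECRET",
                                "ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth, wait_on_rate_limit=True)  # sleep through rate limits

tweets = api.user_timeline(screen_name="jack", count=20)  # statuses/user_timeline
followees = api.get_friend_ids(screen_name="jack")        # friends/ids
followers = api.get_follower_ids(screen_name="jack")      # followers/ids
profile = api.get_user(screen_name="jack")                # users/show
print(len(tweets), profile.followers_count)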

Below is the complete crawler script from this post. Note that it does not go through the app API above: it fetches Twitter's web timeline JSON endpoints directly, authenticated with browser cookies.

# -*- coding: utf-8 -*-
# @Time : 2020/10/21 16:43
# @Author : yuchuan
# @File : crawl_tweets_by_id.py
# @Software : PyCharm
import requests
import json
import urllib.parse
import pandas as pd
from datetime import datetime, timedelta
import time

# getuserID resolves a user ID from a screen name, so IDs need not be looked up manually
def main():
    # Build request headers that impersonate a logged-in browser.
    # Fields shown as ******************* must be replaced with your own values.
    headers = {
        "cookie": 'guest_id=v1:164775775099786971; d_prefs=MToxLGNvbnNlbnRfdmVyc2lvbjoyLHRleHRfdmVyc2lvbjoxMDAw; guest_id_ads=v1:164775775099786971; guest_id_marketing=v1:164775775099786971; personalization_id="v1_Gxv55pYehjUUIqIzDWKURw=="; _ga=GA1.2.1337685308.1647757781; _gid=GA1.2.1172757769.1647757781; kdt=p7UZYMTikXsQrnReETEWpS60OvEm9FE2jnoM1XA4; auth_token=46c7ede4ccb9ecf77a4d15b07884d903293215d0; ct0=3377c7c5232fe924d9e29fcec79a9f94353ebb29dd5504b7dbd9cddff6d3402fd19a48de16bd04fd830a5fdbff90420a9d37f27f768cbda3a081b4eff5f729448435b85fc608d4abd083dcc7d3de3809; twid=u=1499860202291531782; att=1-tG82KoUAicixFemkiupEht8SE0BroiRsYUYySREd; des_opt_in=Y; external_referer=padhuUp37zixoA2Yz6IlsoQTSjz5FgRcKMoWWYN3PEQ=|0|8e8t2xd8A2w=',
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36",
        # x-csrf-token must equal the ct0 value inside the cookie above
        "x-csrf-token": "3377c7c5232fe924d9e29fcec79a9f94353ebb29dd5504b7dbd9cddff6d3402fd19a48de16bd04fd830a5fdbff90420a9d37f27f768cbda3a081b4eff5f729448435b85fc608d4abd083dcc7d3de3809",
        "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
    }
    # User IDs could also be supplied directly instead of screen names
    username = pd.read_csv('*******************.csv')
    for i in range(0, len(username['username'])):
        print(str(username['username'][i]))
        csv_username = str(username['username'][i])
        userID = getuserID(csv_username, headers)
        twitterURL = "https://twitter.com/i/api/2/timeline/profile/" + userID + ".json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&include_tweet_replies=false&count=20&userId=" + userID + "&ext=mediaStats%2ChighlightedLabel"
        flag = True
        full_content = []
        # flag records whether the current page still contains tweets from
        # today; flag = False stops the crawl
        while flag:
            # Fetch one page of the timeline
            response = connect(headers, twitterURL)
            # formatRes converts the response bytes into a dict
            responseJson = formatRes(response.content)
            # Parse every tweet in this JSON page: text, time, retweets, likes, etc.
            content = parsetweets(responseJson)
            # Accumulate each page's tweets into one list
            full_content.extend(content)
            # Build the URL of the next JSON page of tweets
            twitterURL = getNewURL(responseJson, userID)
            flag = CtrlFlag(content)
        print("------------------------------------------------------------------------------------------------\n------------------------------------------------------------------------------------------------")
        # Keep only today's tweets
        everydaytweet = todaytweet(full_content)
        # Save to CSV, one file per user
        saveData(everydaytweet, csv_username)
        time.sleep(30)

# Keep only today's tweets. The original idea was to sort inside the CSV and
# slice out today's rows, but sorting by time there did not work; filtering
# today's tweets from the list before saving to CSV does.
def CtrlFlag(content):
    flag = True
    # Local (Hong Kong) time minus 8 hours gives UTC, matching created_at
    today_str = (todaytime() + timedelta(hours=-8)).strftime("%Y-%m-%d")
    count = 0
    for i in range(0, len(content)):
        if content[i][0][0:10] not in today_str:
            count = count + 1
    if count == len(content):
        flag = False
    return flag

def getuserID(username, headers):
    # The GraphQL UserByScreenName endpoint maps a screen name to a user object
    connectURL = "https://twitter.com/i/api/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22" + username + "%22%2C%22withHighlightedLabel%22%3Atrue%7D"
    print(connectURL)
    response = connect(headers, connectURL)
    responseJson = formatRes(response.content)
    data = responseJson['data']['user']
    # rest_id is the numeric user ID
    userID = find('rest_id', data)
    return userID

def todaytweet(full_content):
    content = []
    # todaytime() is Hong Kong time; subtracting 8 hours gives UTC, which
    # matches the crawled created_at timestamps
    today_str = (todaytime() + timedelta(hours=-8)).strftime("%Y-%m-%d")
    for i in range(0, len(full_content)):
        if full_content[i][0][0:10] in today_str:
            content.append(full_content[i])
    return content

# Parse the tweets, timestamps, and counts from one JSON page, reformatting
# the timestamps as YYYY-MM-DD HH:MM:SS
def parsetweets(responseJson):
    tweets = responseJson['globalObjects']['tweets']
    full_text = findAll('full_text', tweets)
    created_at = findAll('created_at', tweets)
    favorite_count = findAll('favorite_count', tweets)
    quote_count = findAll('quote_count', tweets)
    reply_count = findAll('reply_count', tweets)
    retweet_count = findAll('retweet_count', tweets)
    time1 = []
    time2 = []
    for i in range(0, len(created_at)):
        # created_at is already UTC, so everything is kept in UTC: local
        # (Hong Kong) time is UTC+8, US Eastern time is UTC-5
        time1.append(datetime.strptime(created_at[i], "%a %b %d %H:%M:%S +0000 %Y"))
        time2.append(datetime.strftime(time1[i], '%Y-%m-%d %H:%M:%S'))  # datetime -> str
    tweetData = []
    for i in range(0, len(full_text)):
        tweetData.append([time2[i], full_text[i], favorite_count[i], quote_count[i], reply_count[i], retweet_count[i]])
    return tweetData

# Current date as a datetime (formatted later as e.g. 20201029); callers
# convert to str as needed
def todaytime():
    today = datetime.today()
    return today

# Save the tweets to CSV, one file per user
def saveData(content, filename):
    filetime = todaytime().strftime('%Y%m%d')
    filename = filetime + " " + filename
    filepath = 'D:/twitterdata/' + filename + '.csv'
    name = ['Time', 'Tweet', 'Favorite', 'Quote', 'Reply', 'Retweet']
    Data = pd.DataFrame(columns=name, data=content)
    Data.to_csv(filepath, encoding='utf-8-sig')

# Search nested dict data for a key and return its value
def find(target, dictData, notFound='not found'):
    queue = [dictData]
    while len(queue) > 0:
        data = queue.pop()
        for key, value in data.items():
            if key == target:
                return value
            elif type(value) == dict:
                queue.append(value)
    return notFound

# Collect the value of target from every tweet entry in a dict of dicts
def findAll(target, dictData):
    result = []
    for key, values in dictData.items():
        result.append(values[target])
    return result

# Extract the pagination cursor and build the URL of the next page
def getNewURL(responseJson, userID):
    # The last element of the entries list is the dict that carries the cursor
    responseJsonCursor1 = responseJson['timeline']['instructions'][0]['addEntries']['entries'][-1]
    cursorASCII = find('cursor', responseJsonCursor1)
    cursorASCII2 = find('value', cursorASCII)
    cursor = urllib.parse.quote(cursorASCII2)
    newURL = "https://twitter.com/i/api/2/timeline/profile/" + userID + ".json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&include_tweet_replies=false&count=20&cursor=" + cursor + "&userId=" + userID + "&ext=mediaStats%2ChighlightedLabel"
    return newURL

# Format the fetched JSON: bytes -> str, then json.loads() parses the str into a dict
def formatRes(res):
    strRes = str(res, 'utf-8')
    dictRes = json.loads(strRes)
    return dictRes

# Fetch a URL through a local proxy; adjust the proxies to your own setup
def connect(headers, twitterURL):
    proxies = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
    response = requests.get(twitterURL, headers=headers, proxies=proxies)
    return response

if __name__ == "__main__":  # entry point
    main()
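For reference, the input CSV read by main() is assumed to hold a single username column of screen names (the real filename is redacted above). A quick way to produce such a file, with usernames.csv as an illustrative name and two example accounts:

import pandas as pd

# Replace the example screen names with the accounts you want to crawl
pd.DataFrame({"username": ["jack", "TwitterDev"]}).to_csv("usernames.csv", index=False)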

