1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
|
import calendar
import csv
import json
import time
import urllib.parse
from datetime import datetime, timedelta
from itertools import chain

import numpy as np
import pandas as pd
import requests
def main():
    """Scrape today's tweets for every username listed in the input CSV.

    For each user: resolve the numeric user id, page through the profile
    timeline until CtrlFlag() reports that the page holds no more tweets
    from today, then write the collected rows to a per-user CSV file and
    sleep before the next user.
    """
    headers = {
        "cookie": 'guest_id=v1:164775775099786971; d_prefs=MToxLGNvbnNlbnRfdmVyc2lvbjoyLHRleHRfdmVyc2lvbjoxMDAw; guest_id_ads=v1:164775775099786971; guest_id_marketing=v1:164775775099786971; personalization_id="v1_Gxv55pYehjUUIqIzDWKURw=="; _ga=GA1.2.1337685308.1647757781; _gid=GA1.2.1172757769.1647757781; kdt=p7UZYMTikXsQrnReETEWpS60OvEm9FE2jnoM1XA4; auth_token=46c7ede4ccb9ecf77a4d15b07884d903293215d0; ct0=3377c7c5232fe924d9e29fcec79a9f94353ebb29dd5504b7dbd9cddff6d3402fd19a48de16bd04fd830a5fdbff90420a9d37f27f768cbda3a081b4eff5f729448435b85fc608d4abd083dcc7d3de3809; twid=u=1499860202291531782; att=1-tG82KoUAicixFemkiupEht8SE0BroiRsYUYySREd; des_opt_in=Y; external_referer=padhuUp37zixoA2Yz6IlsoQTSjz5FgRcKMoWWYN3PEQ=|0|8e8t2xd8A2w=',
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36",
        # NOTE(review): this value duplicates the user-agent string. Twitter's
        # x-csrf-token is normally the ct0 cookie value — confirm whether this
        # header ever worked as intended before relying on it.
        "x-csrf-token": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36",
        "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
    }
    username = pd.read_csv('*******************.csv')
    for raw_name in username['username']:
        csv_username = str(raw_name)
        print(csv_username)
        userID = getuserID(csv_username, headers)
        twitterURL = "https://twitter.com/i/api/2/timeline/profile/" + userID + ".json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&include_tweet_replies=false&count=20&userId=" + userID + "&ext=mediaStats%2ChighlightedLabel"
        full_content = []
        keep_paging = True
        # BUG FIX: the original issued an extra connect() before this loop and
        # discarded the response — a duplicate fetch of the first page.
        while keep_paging:
            response = connect(headers, twitterURL)
            responseJson = formatRes(response.content)
            content = parsetweets(responseJson)
            full_content.extend(content)
            twitterURL = getNewURL(responseJson, userID)
            keep_paging = CtrlFlag(content)
        print("------------------------------------------------------------------------------------------------\n------------------------------------------------------------------------------------------------")
        everydaytweet = todaytweet(full_content)
        saveData(everydaytweet, csv_username)
        time.sleep(30)  # throttle between users to avoid rate limiting
def CtrlFlag(content, now=None):
    """Return True while timeline paging should continue.

    Paging stops (returns False) when NONE of the rows in *content* carry
    today's date — i.e. the pager has scrolled past today's tweets. An
    empty page also stops the pager (original behavior preserved).

    :param content: rows of [time_str, text, fav, quote, reply, retweet]
    :param now: reference datetime for "today"; defaults to todaytime().
                Injectable for testing (backward-compatible addition).
    """
    # BUG FIX (smell): the original bound a local named `time`, shadowing the
    # imported `time` module inside this function.
    reference = todaytime() if now is None else now
    # Same UTC-8 shift the rest of the file uses — presumably a local
    # timezone conversion; TODO confirm the intended zone.
    today = (reference + timedelta(hours=-8)).strftime("%Y-%m-%d")
    stale = sum(1 for row in content if row[0][0:10] not in today)
    return stale != len(content)


def getuserID(username, headers):
    """Resolve *username* (a screen name) to its numeric rest_id string
    via Twitter's GraphQL UserByScreenName endpoint.

    :raises: whatever connect()/formatRes() raise on network/JSON errors.
    """
    connectURL = "https://twitter.com/i/api/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables=%7B%22screen_name%22%3A%22" + username + "%22%2C%22withHighlightedLabel%22%3Atrue%7D"
    print(connectURL)
    response = connect(headers, connectURL)
    responseJson = formatRes(response.content)
    data = responseJson['data']['user']
    userID = find('rest_id', data)
    return userID
def todaytweet(full_content, now=None):
    """Filter *full_content* down to the rows posted "today" (UTC-8).

    :param full_content: rows of [time_str, text, fav, quote, reply, retweet]
    :param now: reference datetime for "today"; defaults to todaytime().
                Injectable for testing (backward-compatible addition).
    :return: the matching rows, original order preserved.
    """
    # BUG FIX (smell): the original bound a local named `time`, shadowing the
    # imported `time` module inside this function.
    reference = todaytime() if now is None else now
    today = (reference + timedelta(hours=-8)).strftime("%Y-%m-%d")
    return [row for row in full_content if row[0][0:10] in today]
def parsetweets(responseJson):
    """Flatten the tweets of a timeline response into a list of rows.

    Each row is [time, full_text, favorite_count, quote_count,
    reply_count, retweet_count], with the Twitter timestamp reformatted
    to 'YYYY-MM-DD HH:MM:SS' (same instant, presumably UTC — the +0000
    offset is assumed by the parse format).

    BUG FIXES (smells): the original shadowed the builtin `dict` with its
    parameter name, made six separate findAll() passes over the same map,
    and carried dead locals (formatcreated_at, utc_time1). This version
    does one pass; row order (dict insertion order) is unchanged.
    """
    tweets = responseJson['globalObjects']['tweets']
    rows = []
    for tweet in tweets.values():
        created = datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y")
        rows.append([
            created.strftime('%Y-%m-%d %H:%M:%S'),
            tweet['full_text'],
            tweet['favorite_count'],
            tweet['quote_count'],
            tweet['reply_count'],
            tweet['retweet_count'],
        ])
    return rows
def todaytime():
    """Return the current local date-time (naive), as used by the
    day-filtering helpers in this module."""
    current = datetime.today()
    return current
def saveData(content, filename, out_dir='D:/twitterdata/'):
    """Write tweet rows to '<out_dir><YYYYMMDD> <filename>.csv'.

    :param content: rows of [time, text, favorite, quote, reply, retweet]
    :param filename: base name (the username), prefixed with today's date
    :param out_dir: destination directory incl. trailing slash; defaults
                    to the original hard-coded path (backward compatible).
    """
    # BUG FIX: the original used '%y%y%m%d' — the two-digit year twice
    # (e.g. '22220321'); the intent was clearly a four-digit-year prefix.
    filetime = datetime.today().strftime('%Y%m%d')
    filename = filetime + " " + filename
    filepath = out_dir + filename + '.csv'
    name = ['Time', 'Tweet', 'Favorite', 'Quote', 'Reply', 'Retweet']
    Data = pd.DataFrame(columns=name, data=content)
    # utf-8-sig keeps Excel happy with non-ASCII tweet text.
    Data.to_csv(filepath, encoding='utf-8-sig')
def find(target, dictData, notFound='没找到'):
    """Depth-first search nested dicts for the first occurrence of key
    *target* and return its value.

    Only descends into values that are themselves dicts (lists are NOT
    searched — original behavior preserved). Returns the *notFound*
    sentinel when the key is absent anywhere.
    """
    # BUG FIX (smell): the original named this `queue` but used pop()
    # (LIFO), so it is really a stack; the traversal order is kept.
    stack = [dictData]
    while stack:
        node = stack.pop()
        for key, value in node.items():
            if key == target:
                return value
            if isinstance(value, dict):  # was `type(value) == dict`
                stack.append(value)
    return notFound


def findAll(target, dictData, notFound=None):
    """Collect dictData[k][target] for every top-level key k of *dictData*.

    :raises KeyError/TypeError: if an entry lacks *target* (unchanged).

    BUG FIX: the original declared a mutable default argument
    (notFound=[]); the parameter is never used in the body, so it is kept
    only for backward compatibility with a safe None default.
    """
    return [values[target] for values in dictData.values()]
def getNewURL(responseJson, userID):
    """Build the next-page timeline URL from the bottom cursor entry of
    the current timeline response."""
    bottom_entry = responseJson['timeline']['instructions'][0]['addEntries']['entries'][-1]
    cursor_holder = find('cursor', bottom_entry)
    raw_cursor = find('value', cursor_holder)
    # The cursor value must be percent-encoded before going into the query.
    encoded_cursor = urllib.parse.quote(raw_cursor)
    next_url = (
        "https://twitter.com/i/api/2/timeline/profile/" + userID
        + ".json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&include_tweet_replies=false&count=20&cursor="
        + encoded_cursor + "&userId=" + userID + "&ext=mediaStats%2ChighlightedLabel"
    )
    return next_url
def formatRes(res):
    """Decode a UTF-8 response body (bytes) and parse it as JSON."""
    text = str(res, 'utf-8')
    parsed = json.loads(text)
    return parsed
def connect(headers, twitterURL):
    """GET *twitterURL* with the given headers, routed through the local
    proxy at 127.0.0.1:7890 (both http and https), and return the
    requests.Response."""
    proxy_map = {
        "http": "http://127.0.0.1:7890",
        "https": "http://127.0.0.1:7890",
    }
    return requests.get(twitterURL, headers=headers, proxies=proxy_map)
if __name__ == "__main__":
    main()
|