Multithreaded PGN database download working with a list of random user agents
This commit is contained in:
parent
7b768f6f66
commit
5a51d4b151
@ -1,3 +1,5 @@
|
|||||||
|
import user_agents
|
||||||
|
import random
|
||||||
import requests
|
import requests
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
@ -6,12 +8,15 @@ import concurrent.futures
|
|||||||
|
|
||||||
def playerArchives(username):
|
def playerArchives(username):
|
||||||
# Create archive list. This is a list of pages containing lists of games used for further downloading
|
# Create archive list. This is a list of pages containing lists of games used for further downloading
|
||||||
url = "https://api.chess.com/pub/player/{}/games/archives".format(username)
|
|
||||||
archive = requests.get(url).json()['archives']
|
|
||||||
|
|
||||||
file_path = '/home/unknown/Documents/projects/chess/archives' #TODO come up with general saving scheme
|
user_agent = random.choice(user_agents.user_agents) # load up random user agent from list
|
||||||
# TODO maybe path should be getcwd() instead?
|
headers = {
|
||||||
|
'User-Agent':user_agent
|
||||||
|
}
|
||||||
|
url = f"https://api.chess.com/pub/player/{username}/games/archives"
|
||||||
|
archive = requests.get(url, headers=headers).json()['archives']
|
||||||
|
|
||||||
|
file_path = os.getcwd()
|
||||||
file_name = os.path.join(file_path, username+'_archive.txt')
|
file_name = os.path.join(file_path, username+'_archive.txt')
|
||||||
with open(file_name, 'w') as f:
|
with open(file_name, 'w') as f:
|
||||||
json.dump(archive, f)
|
json.dump(archive, f)
|
||||||
@ -19,67 +24,40 @@ def playerArchives(username):
|
|||||||
return archive
|
return archive
|
||||||
|
|
||||||
|
|
||||||
# Provide a URL to the archive, e.g. one of the URLs in the list returned by the above function, playerArchives(),
|
def playerMonthly(url=None):
|
||||||
# or else manually provide chess.com's required GET request parameters
|
|
||||||
|
|
||||||
# SAVE NESTED PGN TO PGN DATABASE FILE:
|
user_agent = random.choice(user_agents.user_agents) # load up random user agent from list
|
||||||
|
headers = {
|
||||||
|
'User-Agent':user_agent
|
||||||
|
}
|
||||||
|
|
||||||
#pgn_games = []
|
|
||||||
#with open('username-database.pgn', 'w') as f:
|
|
||||||
# for game in pgn_games:
|
|
||||||
# output.write(game)
|
|
||||||
# output.write("\n\n")
|
|
||||||
#
|
|
||||||
|
|
||||||
def playerMonthly(url=None): #TODO you now need to provide username and email in for the user agent:
|
|
||||||
# check https://www.chess.com/clubs/forum/view/error-403-in-member-profile?page=2
|
|
||||||
# session = requests.session()
|
|
||||||
# session.headers["User-Agent"] = "username:hipposcottimus, email:spbeach46@gmail.com"
|
|
||||||
# session.get(url).json()
|
|
||||||
if url:
|
if url:
|
||||||
url=url
|
url=url+'/pgn' # connect to multi-game pgn download endpoint by appending a "pgn" to the url provided by url in archive list
|
||||||
else:
|
else:
|
||||||
username=input("username: ")
|
username=input("username: ")
|
||||||
YYYY=input("year in YYYY: ")
|
YYYY=input("year in YYYY: ")
|
||||||
MM=input("month in MM: ")
|
MM=input("month in MM: ")
|
||||||
url = "https://api.chess.com/pub/player/{}/games/{YYYY}/{MM}".format(username, YYYY, MM)
|
url = f"https://api.chess.com/pub/player/{username}/games/{YYYY}/{MM}/pgn"
|
||||||
|
|
||||||
# get and save games list in .pgn format
|
# get and save games list in .pgn format
|
||||||
data = requests.get(url)#.json()
|
games = requests.get(url, headers=headers).content.decode("utf-8")
|
||||||
file_path = '/home/unknown/Documents/projects/chess/games'
|
|
||||||
|
|
||||||
# for game in data['games']: # TODO just append each game to a single text file somehow and save as pgn. Can't load as pgn from json/dict
|
|
||||||
# uuid = game['uuid']
|
|
||||||
# filename = os.path.join(file_path, uuid+".pgn")
|
|
||||||
# with open(filename, 'w') as f:
|
|
||||||
# f.write(game['pgn']) # writes a single game to .pgn format
|
|
||||||
return data
|
|
||||||
# return games_list
|
|
||||||
|
|
||||||
|
return games
|
||||||
|
|
||||||
# Multithreaded games download
|
# Multithreaded games download
|
||||||
|
def multiThredd(username, archive):
|
||||||
def threddGames(username=None, archive=None):
|
with open(archive) as f:
|
||||||
|
archive = json.load(f)
|
||||||
path = '/home/unknown/Documents/projects/chess/games/{}'.format(username)
|
|
||||||
# TODO maybe path should be getcwd() instead?
|
|
||||||
|
|
||||||
try:
|
|
||||||
os.makedirs(path)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if archive:
|
|
||||||
with open(archive) as f:
|
|
||||||
archive = json.load(f)
|
|
||||||
else:
|
|
||||||
archive = playerArchives(username)
|
|
||||||
|
|
||||||
# async download games
|
# async download games
|
||||||
|
games_list = []
|
||||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
for future in executor.map(playerMonthly, archive):
|
for future in executor.map(playerMonthly, archive):
|
||||||
future #TODO incomplete as is.
|
games_list.extend(future)
|
||||||
|
pgn_db = "\n\n".join(games_list)
|
||||||
|
with open(username+'.pgn', 'w') as f:
|
||||||
|
f.write(pgn_db)
|
||||||
|
|
||||||
|
return pgn_db
|
||||||
|
|
||||||
|
|
||||||
# TODO Gameplan: use playerArchives to produce list, concurrent futures to send out multithreaded requests,
|
|
||||||
# either append to another empty list or write directly to a file immediately after download and parse
|
|
||||||
# vars in concurrent.futures: url_list from playerArchives, response_to_parse, empty
|
|
||||||
|
8
user_agents.py
Normal file
8
user_agents.py
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
user_agents = [
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
|
||||||
|
"Mozilla/5.0 (Windows NT 6.1; rv:109.0) Gecko/20100101 Firefox/113.0",
|
||||||
|
"Mozilla/5.0 (Android 12; Mobile; rv:109.0) Gecko/113.0 Firefox/113.0",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0",
|
||||||
|
]
|
Loading…
Reference in New Issue
Block a user