From 5a51d4b1519d8a5ec86b9d0689fe843b09f94bd5 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 8 Nov 2023 01:12:40 -0700 Subject: [PATCH 1/4] multithreaded pgn database download working with list of random user agents --- player_games.py | 80 ++++++++++++++++++------------------------------- user_agents.py | 8 +++++ 2 files changed, 37 insertions(+), 51 deletions(-) create mode 100644 user_agents.py diff --git a/player_games.py b/player_games.py index f780af2..9a814d0 100644 --- a/player_games.py +++ b/player_games.py @@ -1,3 +1,5 @@ +import user_agents +import random import requests import re import os @@ -6,12 +8,15 @@ import concurrent.futures def playerArchives(username): # Create archive list. This is a list of pages containing lists of games used for futher downloading - url = "https://api.chess.com/pub/player/{}/games/archives".format(username) - archive = requests.get(url).json()['archives'] - file_path = '/home/unknown/Documents/projects/chess/archives' #TODO come up with general saving scheme - # TODO maybe path should be getcwd() instead? + user_agent = random.choice(user_agents.user_agents) # load up random user agent from list + headers = { + 'User-Agent':user_agent + } + url = f"https://api.chess.com/pub/player/{username}/games/archives" + archive = requests.get(url, headers=headers).json()['archives'] + file_path = os.getcwd() file_name = os.path.join(file_path, username+'_archive.txt') with open(file_name, 'w') as f: json.dump(archive, f) @@ -19,67 +24,40 @@ def playerArchives(username): return archive -# Provide a URL to the archive including from the above function, playerArchives() -# or else manually provide chess.com's required GET request parameters +def playerMonthly(url=None): -# SAVE NESTED PGN TO PGN DATABASE FILE: + user_agent = random.choice(user_agents.user_agents) # load up random user agent from list + headers = { + 'User-Agent':user_agent + } -#pgn_games = [] -#with open('username-database.pgn', 'w') as f: -# for game in pgn_games: -# output.write(game) -# output.write("\n\n") -# - -def playerMonthly(url=None): #TODO you now need to provide username and email in for the user agent: - # check https://www.chess.com/clubs/forum/view/error-403-in-member-profile?page=2 - # session = requests.session() - # session.headers["User-Agent"] = "username:hipposcottimus, email:spbeach46@gmail.com" - # session.get(url).json() if url: - url=url + url=url+'/pgn' # connect to multi-game pgn download endpoint by appending a "pgn" to the url provided by url in archive list else: username=input("username: ") YYYY=input("year in YYYY: ") MM=input("month in MM: ") - url = "https://api.chess.com/pub/player/{}/games/{YYYY}/{MM}".format(username, YYYY, MM) + url = f"https://api.chess.com/pub/player/{username}/games/{YYYY}/{MM}/pgn" # get and save games list in .pgn format - data = requests.get(url)#.json() - file_path = '/home/unknown/Documents/projects/chess/games' - -# for game in data['games']: # TODO just append each game to a single text file somehow and save as pgn. Can't load as pgn from json/dict -# uuid = game['uuid'] -# filename = os.path.join(file_path, uuid+".pgn") -# with open(filename, 'w') as f: -# f.write(game['pgn']) # writes a single game to .pgn format - return data -# return games_list + games = requests.get(url, headers=headers).content.decode("utf-8") + return games # Multithreaded games download - -def threddGames(username=None, archive=None): - - path = '/home/unknown/Documents/projects/chess/games/{}'.format(username) - # TODO maybe path should be getcwd() instead? - - try: - os.makedirs(path) - except OSError: - pass - - if archive: - with open(archive) as f: - archive = json.load(f) - else: - archive = playerArchives(username) +def multiThredd(username, archive): + with open(archive) as f: + archive = json.load(f) # async download games + games_list = [] with concurrent.futures.ThreadPoolExecutor() as executor: for future in executor.map(playerMonthly, archive): - future #TODO incomplete as is. + games_list.extend(future) + pgn_db = "\n\n".join(games_list) + with open(username+'.pgn', 'w') as f: + f.write(pgn_db) + + return pgn_db + -# TODO Gameplan: use playerArchives to produce list, concurrent futures to send out multithreaded requests, -# either append to another empty list or write directly to a file immediately after download and parse -# vars in concurrent.futures: url_list from playerArchives, response_to_parse, empty diff --git a/user_agents.py b/user_agents.py new file mode 100644 index 0000000..b3b779c --- /dev/null +++ b/user_agents.py @@ -0,0 +1,8 @@ +user_agents = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35", + "Mozilla/5.0 (Windows NT 6.1; rv:109.0) Gecko/20100101 Firefox/113.0", + "Mozilla/5.0 (Android 12; Mobile; rv:109.0) Gecko/113.0 Firefox/113.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0", +] From dc8e0aa86e71a451c65b37d46f510af01c78f630 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 8 Nov 2023 01:20:18 -0700 Subject: [PATCH 2/4] repeat commit --- player_games.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/player_games.py b/player_games.py index 9a814d0..0d8507d 100644 --- a/player_games.py +++ b/player_games.py @@ -6,6 +6,7 @@ import os import json import concurrent.futures + def playerArchives(username): # Create archive list. This is a list of pages containing lists of games used for futher downloading @@ -60,4 +61,3 @@ def multiThredd(username, archive): return pgn_db - From 67d3189bfe6c004b69dbe232ccc51dc97a6a0fa9 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 8 Nov 2023 01:44:16 -0700 Subject: [PATCH 3/4] commit before checkout master --- player_games.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/player_games.py b/player_games.py index 0d8507d..96cb292 100644 --- a/player_games.py +++ b/player_games.py @@ -46,9 +46,19 @@ def playerMonthly(url=None): return games # Multithreaded games download -def multiThredd(username, archive): - with open(archive) as f: - archive = json.load(f) +def threddGames(username=None, archive=None): + + try: + path = os.getcwd()+'/archives' + os.makedirs(path) + except OSError: + pass + + if archive: + with open(archive) as f: + archive = json.load(f) + else: + archive = playerArchives(username) # async download games games_list = [] From 2c9e4f5986833f7ade6d6d069bbae89941d83732 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 8 Nov 2023 01:57:39 -0700 Subject: [PATCH 4/4] cleaned up --- player_games.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/player_games.py b/player_games.py index 96cb292..b584a68 100644 --- a/player_games.py +++ b/player_games.py @@ -17,8 +17,14 @@ def playerArchives(username): url = f"https://api.chess.com/pub/player/{username}/games/archives" archive = requests.get(url, headers=headers).json()['archives'] - file_path = os.getcwd() - file_name = os.path.join(file_path, username+'_archive.txt') + try: + cwd = os.getcwd() + path = os.path.join(cwd, 'archives') + os.makedirs(path) + except OSError: + pass + + file_name = os.path.join(path, username+'_archive.txt') with open(file_name, 'w') as f: json.dump(archive, f) @@ -48,12 +54,6 @@ def playerMonthly(url=None): # Multithreaded games download def threddGames(username=None, archive=None): - try: - path = os.getcwd()+'/archives' - os.makedirs(path) - except OSError: - pass - if archive: with open(archive) as f: archive = json.load(f)