multithreaded pgn database download working with list of random user agents
This commit is contained in:
		@@ -1,3 +1,5 @@
 | 
				
			|||||||
 | 
					import user_agents
 | 
				
			||||||
 | 
					import random
 | 
				
			||||||
import requests
 | 
					import requests
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
@@ -6,12 +8,15 @@ import concurrent.futures
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
def playerArchives(username):
 | 
					def playerArchives(username):
 | 
				
			||||||
    # Create archive list. This is a list of pages containing lists of games used for further downloading
 | 
					    # Create archive list. This is a list of pages containing lists of games used for further downloading
 | 
				
			||||||
    url = "https://api.chess.com/pub/player/{}/games/archives".format(username)
 | 
					 | 
				
			||||||
    archive = requests.get(url).json()['archives']
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    file_path = '/home/unknown/Documents/projects/chess/archives' #TODO come up with general saving scheme
 | 
					    user_agent = random.choice(user_agents.user_agents) # load up random user agent from list
 | 
				
			||||||
    # TODO maybe path should be getcwd() instead?
 | 
					    headers = {
 | 
				
			||||||
 | 
					            'User-Agent':user_agent
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					    url = f"https://api.chess.com/pub/player/{username}/games/archives"
 | 
				
			||||||
 | 
					    archive = requests.get(url, headers=headers).json()['archives']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    file_path = os.getcwd()
 | 
				
			||||||
    file_name = os.path.join(file_path, username+'_archive.txt')
 | 
					    file_name = os.path.join(file_path, username+'_archive.txt')
 | 
				
			||||||
    with open(file_name, 'w') as f:
 | 
					    with open(file_name, 'w') as f:
 | 
				
			||||||
        json.dump(archive, f)
 | 
					        json.dump(archive, f)
 | 
				
			||||||
@@ -19,67 +24,40 @@ def playerArchives(username):
 | 
				
			|||||||
    return archive
 | 
					    return archive
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Provide an archive URL (e.g. one returned by the above function, playerArchives())
 | 
					def playerMonthly(url=None):
 | 
				
			||||||
# or else manually provide chess.com's required GET request parameters
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# SAVE NESTED PGN TO PGN DATABASE FILE:
 | 
					    user_agent = random.choice(user_agents.user_agents) # load up random user agent from list
 | 
				
			||||||
 | 
					    headers = {
 | 
				
			||||||
 | 
					            'User-Agent':user_agent
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#pgn_games = []
 | 
					 | 
				
			||||||
#with open('username-database.pgn', 'w') as f:
 | 
					 | 
				
			||||||
#    for game in pgn_games:
 | 
					 | 
				
			||||||
#        output.write(game)
 | 
					 | 
				
			||||||
#        output.write("\n\n")
 | 
					 | 
				
			||||||
#
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def playerMonthly(url=None): #TODO you now need to provide username and email in the user agent:
 | 
					 | 
				
			||||||
    # check https://www.chess.com/clubs/forum/view/error-403-in-member-profile?page=2
 | 
					 | 
				
			||||||
    # session = requests.session()
 | 
					 | 
				
			||||||
    # session.headers["User-Agent"] = "username:hipposcottimus, email:spbeach46@gmail.com"
 | 
					 | 
				
			||||||
    # session.get(url).json()
 | 
					 | 
				
			||||||
    if url:
 | 
					    if url:
 | 
				
			||||||
        url=url
 | 
					        url=url+'/pgn' # connect to the multi-game pgn download endpoint by appending "/pgn" to the url taken from the archive list
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        username=input("username: ")
 | 
					        username=input("username: ")
 | 
				
			||||||
        YYYY=input("year in YYYY: ")
 | 
					        YYYY=input("year in YYYY: ")
 | 
				
			||||||
        MM=input("month in MM: ") 
 | 
					        MM=input("month in MM: ") 
 | 
				
			||||||
        url = "https://api.chess.com/pub/player/{}/games/{YYYY}/{MM}".format(username, YYYY, MM)
 | 
					        url = f"https://api.chess.com/pub/player/{username}/games/{YYYY}/{MM}/pgn"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # get and save games list in .pgn format
 | 
					    # get and save games list in .pgn format
 | 
				
			||||||
    data = requests.get(url)#.json()
 | 
					    games = requests.get(url, headers=headers).content.decode("utf-8")
 | 
				
			||||||
    file_path = '/home/unknown/Documents/projects/chess/games'
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#    for game in data['games']: # TODO just append each game to a single text file somehow and save as pgn. Can't load as pgn from json/dict
 | 
					 | 
				
			||||||
#        uuid = game['uuid']
 | 
					 | 
				
			||||||
#        filename = os.path.join(file_path, uuid+".pgn")
 | 
					 | 
				
			||||||
#        with open(filename, 'w') as f:
 | 
					 | 
				
			||||||
#            f.write(game['pgn']) # writes a single game to .pgn format
 | 
					 | 
				
			||||||
    return data
 | 
					 | 
				
			||||||
# return games_list
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return games
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Multithreaded games download
 | 
					# Multithreaded games download
 | 
				
			||||||
 | 
					def multiThredd(username, archive):
 | 
				
			||||||
def threddGames(username=None, archive=None):
 | 
					    with open(archive) as f:
 | 
				
			||||||
 | 
					        archive = json.load(f)
 | 
				
			||||||
    path = '/home/unknown/Documents/projects/chess/games/{}'.format(username)
 | 
					 | 
				
			||||||
    # TODO maybe path should be getcwd() instead?
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
    try:
 | 
					 | 
				
			||||||
        os.makedirs(path)
 | 
					 | 
				
			||||||
    except OSError:
 | 
					 | 
				
			||||||
        pass
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if archive:
 | 
					 | 
				
			||||||
        with open(archive) as f:
 | 
					 | 
				
			||||||
            archive = json.load(f)
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        archive = playerArchives(username)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # async download games
 | 
					    # async download games
 | 
				
			||||||
 | 
					    games_list = []
 | 
				
			||||||
    with concurrent.futures.ThreadPoolExecutor() as executor:
 | 
					    with concurrent.futures.ThreadPoolExecutor() as executor:
 | 
				
			||||||
        for future in executor.map(playerMonthly, archive):
 | 
					        for future in executor.map(playerMonthly, archive):
 | 
				
			||||||
            future #TODO incomplete as is.
 | 
					            games_list.extend(future)
 | 
				
			||||||
 | 
					    pgn_db = "\n\n".join(games_list)
 | 
				
			||||||
 | 
					    with open(username+'.pgn', 'w') as f:
 | 
				
			||||||
 | 
					        f.write(pgn_db)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return pgn_db
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# TODO  Gameplan: use playerArchives to produce list, concurrent futures to send out multithreaded requests,
 | 
					 | 
				
			||||||
# either append to another empty list or write directly to a file immediately after download and parse
 | 
					 | 
				
			||||||
# vars in concurrent.futures: url_list from playerArchives, response_to_parse, empty
 | 
					 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										8
									
								
								user_agents.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								user_agents.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,8 @@
 | 
				
			|||||||
 | 
					user_agents = [
 | 
				
			||||||
 | 
					    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
 | 
				
			||||||
 | 
					    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
 | 
				
			||||||
 | 
					    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
 | 
				
			||||||
 | 
					    "Mozilla/5.0 (Windows NT 6.1; rv:109.0) Gecko/20100101 Firefox/113.0",
 | 
				
			||||||
 | 
					    "Mozilla/5.0 (Android 12; Mobile; rv:109.0) Gecko/113.0 Firefox/113.0",
 | 
				
			||||||
 | 
					    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/113.0",
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
		Reference in New Issue
	
	Block a user