diff --git a/README.md b/README.md index 59121e8..ae582dd 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,11 @@ for search_query in search_queries: proxy_rotation_index += 1 ``` +## GOOGLE_ABUSE_EXEMPTION cookie + +If you have a `GOOGLE_ABUSE_EXEMPTION` cookie value, it can be passed into `google_exemption` when instantiating the +`SearchClient` object. + ## &tbs= URL filter clarification The `&tbs=` parameter is used to specify either verbatim or time-based filters. @@ -291,3 +296,4 @@ Project Link: [https://github.com/opsdisk/yagooglesearch](https://github.com/ops ## Contributors * [KennBro](https://github.com/KennBro) - +* [ArshansGithub](https://github.com/ArshansGithub) - diff --git a/requirements.txt b/requirements.txt index 7463f32..c51383f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ beautifulsoup4>=4.9.3 -requests>=2.26.0 +requests>=2.31.0 requests[socks] diff --git a/yagooglesearch/__init__.py b/yagooglesearch/__init__.py index 9c7723d..3ee141a 100644 --- a/yagooglesearch/__init__.py +++ b/yagooglesearch/__init__.py @@ -12,7 +12,7 @@ # Custom Python libraries. -__version__ = "1.6.1" +__version__ = "1.7.0" # Logging ROOT_LOGGER = logging.getLogger("yagooglesearch") @@ -86,8 +86,8 @@ def __init__( verify_ssl=True, verbosity=5, verbose_output=False, + google_exemption=None, ): - """ SearchClient :param str query: Query string. Must NOT be url-encoded. @@ -118,6 +118,8 @@ def __init__( This may need to be disabled in some HTTPS proxy instances. :param int verbosity: Logging and console output verbosity. :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False. + :param str google_exemption: Value of the GOOGLE_ABUSE_EXEMPTION cookie. If provided, it is used as the + initial request cookie. Defaults to None. 
:rtype: List of str :return: List of URLs found or list of {"rank", "title", "description", "url"} @@ -142,6 +144,7 @@ def __init__( self.verify_ssl = verify_ssl self.verbosity = verbosity self.verbose_output = verbose_output + self.google_exemption = google_exemption # Assign log level. ROOT_LOGGER.setLevel((6 - self.verbosity) * 10) @@ -151,8 +154,12 @@ def __init__( ROOT_LOGGER.warning("The largest value allowed by Google for num is 100. Setting num to 100.") self.num = 100 - # Initialize cookies to None, will be updated with each request in get_page(). - self.cookies = None + # Populate cookies with GOOGLE_ABUSE_EXEMPTION if it is provided. Otherwise, initialize cookies to None. + # It will be updated with each request in get_page(). + if self.google_exemption: + self.cookies = {"GOOGLE_ABUSE_EXEMPTION": self.google_exemption} + else: + self.cookies = None # Used later to ensure there are not any URL parameter collisions. self.url_parameters = ( @@ -178,7 +185,6 @@ def __init__( # Update proxy_dict if a proxy is provided. if proxy: - # Standardize case since the scheme will be checked against a hard-coded list. self.proxy = proxy.lower() @@ -321,7 +327,12 @@ def get_page(self, url): ROOT_LOGGER.info(f"Requesting URL: {url}") response = requests.get( - url, proxies=self.proxy_dict, headers=headers, cookies=self.cookies, timeout=15, verify=self.verify_ssl + url, + proxies=self.proxy_dict, + headers=headers, + cookies=self.cookies, + timeout=15, + verify=self.verify_ssl, ) # Update the cookies. @@ -341,7 +352,6 @@ def get_page(self, url): # See https://github.com/benbusby/whoogle-search/issues/311 try: if response.cookies["CONSENT"].startswith("PENDING+"): - ROOT_LOGGER.warning( "Looks like your IP address is sourcing from a European Union location...your search results may " "vary, but I'll try and work around this by updating the cookie." 
@@ -381,7 +391,6 @@ def get_page(self, url): html = response.text elif http_response_code == 429: - ROOT_LOGGER.warning("Google is blocking your IP for making too many requests in a specific time period.") # Calling script does not want yagooglesearch to handle HTTP 429 cool off and retry. Just return a @@ -431,7 +440,6 @@ def search(self): # Loop until we reach the maximum result results found or there are no more search results found to reach # max_search_result_urls_to_return. while total_valid_links_found <= self.max_search_result_urls_to_return: - ROOT_LOGGER.info( f"Stats: start={self.start}, num={self.num}, total_valid_links_found={total_valid_links_found} / " f"max_search_result_urls_to_return={self.max_search_result_urls_to_return}" @@ -484,7 +492,6 @@ def search(self): # Process every anchored URL. for a in anchors: - # Get the URL from the anchor tag. try: link = a["href"] @@ -498,7 +505,6 @@ def search(self): continue if self.verbose_output: - # Extract the URL title. try: title = a.get_text() @@ -520,7 +526,6 @@ def search(self): # Check if URL has already been found. if link not in self.search_result_list: - # Increase the counters. valid_links_found_in_this_search += 1 total_valid_links_found += 1