diff --git a/play_scraper/scraper.py b/play_scraper/scraper.py
index ffae983..bb7c4db 100644
--- a/play_scraper/scraper.py
+++ b/play_scraper/scraper.py
@@ -1,15 +1,8 @@
 # -*- coding: utf-8 -*-
 import logging
 
-try:
-    from urllib import quote_plus
-    from urlparse import urljoin
-except ImportError:
-    from urllib.parse import urljoin, quote_plus
-try:
-    basestring
-except NameError:
-    basestring = str
+from typing import Dict, List, Optional, Union
+from urllib.parse import urljoin, quote_plus
 
 import requests
 from bs4 import BeautifulSoup, SoupStrainer
@@ -28,18 +21,46 @@
 )
 
 
-class PlayScraper(object):
-    def __init__(self, hl='en', gl='us'):
+class PlayScraperError(Exception):
+    """Exceção customizada para erros do PlayScraper"""
+    pass
+
+
+class PlayScraper:
+    """
+    Cliente otimizado para scraping da Google Play Store.
+
+    Suporta busca de apps, detalhes, coleções, desenvolvedor e sugestões.
+    """
+
+    # Constantes da classe
+    MAX_RESULTS = 120
+    MAX_PAGE_LIMIT = 500
+    MAX_SEARCH_PAGE = 12
+    MAX_DEV_PAGE_NUM = 12
+    APP_CARD_SELECTOR = 'div[data-uitype="500"]'
+
+    def __init__(self, hl: str = 'en', gl: str = 'us') -> None:
+        """
+        Inicializa o PlayScraper.
+
+        Args:
+            hl: Código do idioma da interface (padrão: 'en')
+            gl: Código do país para geolocalização (padrão: 'us')
+
+        Raises:
+            PlayScraperError: Se os códigos de idioma ou país forem inválidos
+        """
         self.language = hl
-        if self.language not in HL_LANGUAGE_CODES:
-            raise ValueError('{hl} is not a valid language interface code.'.format(
-                hl=self.language))
         self.geolocation = gl
-        if self.geolocation not in GL_COUNTRY_CODES:
-            raise ValueError('{gl} is not a valid geolocation country code.'.format(
-                gl=self.geolocation))
-        self.params = {'hl': self.language,
-                       'gl': self.geolocation}
+
+        self._validate_language()
+        self._validate_geolocation()
+
+        self.params = {
+            'hl': self.language,
+            'gl': self.geolocation
+        }
 
         self._base_url = s.BASE_URL
         self._suggestion_url = s.SUGGESTION_URL
@@ -47,38 +68,93 @@ def __init__(self, hl='en', gl='us'):
         self._pagtok = s.PAGE_TOKENS
         self._log = logging.getLogger(__name__)
 
-    def _parse_multiple_apps(self, list_response):
-        """Extracts app ids from a list's Response object, sends GET requests to
-        each app, parses detailed info and returns all apps in a list.
+    def _validate_language(self) -> None:
+        """Valida o código do idioma"""
+        if self.language not in HL_LANGUAGE_CODES:
+            raise PlayScraperError(
+                f'{self.language} não é um código de idioma válido.'
+            )
+
+    def _validate_geolocation(self) -> None:
+        """Valida o código do país"""
+        if self.geolocation not in GL_COUNTRY_CODES:
+            raise PlayScraperError(
+                f'{self.geolocation} não é um código de país válido.'
+            )
 
-        :param list_response: the Response object from a list request
-        :return: a list of app dictionaries
+    def _parse_multiple_apps(self, list_response: requests.Response) -> List[Dict]:
         """
-        list_strainer = SoupStrainer('span',
-                                     {'class': 'preview-overlay-container'})
-        soup = BeautifulSoup(list_response.content,
-                             'lxml',
-                             from_encoding='utf8',
-                             parse_only=list_strainer)
-
-        app_ids = [x.attrs['data-docid']
-                   for x in soup.select('span.preview-overlay-container')]
+        Extrai IDs de apps de uma resposta de lista e busca detalhes de cada app.
+
+        Args:
+            list_response: Objeto Response de uma requisição de lista
+
+        Returns:
+            Lista de dicionários com dados dos apps
+        """
+        list_strainer = SoupStrainer('span', {'class': 'preview-overlay-container'})
+        soup = BeautifulSoup(
+            list_response.content,
+            'lxml',
+            from_encoding='utf8',
+            parse_only=list_strainer
+        )
+
+        app_ids = [
+            span.attrs['data-docid']
+            for span in soup.select('span.preview-overlay-container')
+        ]
 
         return multi_futures_app_request(app_ids, params=self.params)
 
-    def details(self, app_id):
-        """Sends a GET request and parses an application's details.
+    def _parse_app_cards(self, soup: BeautifulSoup) -> List[Dict]:
+        """
+        Parseia cards de apps do HTML.
+
+        Args:
+            soup: Objeto BeautifulSoup com o HTML
+
+        Returns:
+            Lista de dicionários com informações básicas dos apps
+        """
+        return [
+            parse_card_info(app_card)
+            for app_card in soup.select(self.APP_CARD_SELECTOR)
+        ]
+
+    def _make_soup(self, response: requests.Response) -> BeautifulSoup:
+        """
+        Cria objeto BeautifulSoup a partir de uma resposta HTTP.
+
+        Args:
+            response: Resposta HTTP
+
+        Returns:
+            Objeto BeautifulSoup
+        """
+        return BeautifulSoup(response.content, 'lxml', from_encoding='utf8')
 
-        :param app_id: the app to retrieve details, e.g. 'com.nintendo.zaaa'
-        :return: a dictionary of app details
+    def details(self, app_id: str) -> Dict:
+        """
+        Busca detalhes de uma aplicação específica.
+
+        Args:
+            app_id: ID da aplicação (ex: 'com.nintendo.zaaa')
+
+        Returns:
+            Dicionário com detalhes da aplicação
+
+        Raises:
+            PlayScraperError: Se o ID da aplicação for inválido
         """
         url = build_url('details', app_id)
 
         try:
             response = send_request('GET', url, params=self.params)
-            soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf8')
+            soup = self._make_soup(response)
         except requests.exceptions.HTTPError as e:
-            raise ValueError('Invalid application ID: {app}. {error}'.format(
-                app=app_id, error=e))
+            raise PlayScraperError(
+                f'ID de aplicação inválido: {app_id}. Erro: {e}'
+            )
 
         app_json = parse_app_details(soup)
         app_json.update({
@@ -87,198 +163,285 @@ def details(self, app_id):
         })
 
         return app_json
 
-    def collection(self, collection_id, category_id=None, results=None,
-                   page=None, age=None, detailed=False):
-        """Sends a POST request and fetches a list of applications belonging to
-        the collection and an optional category.
-
-        :param collection_id: the collection id, e.g. 'NEW_FREE'.
-        :param category_id: (optional) the category id, e.g. 'GAME_ACTION'.
-        :param results: the number of apps to retrieve at a time.
-        :param page: page number to retrieve; limitation: page * results <= 500.
-        :param age: an age range to filter by (only for FAMILY categories)
-        :param detailed: if True, sends request per app for its full detail
-        :return: a list of app dictionaries
+    def collection(self,
+                   collection_id: str,
+                   category_id: Optional[str] = None,
+                   results: Optional[int] = None,
+                   page: Optional[int] = None,
+                   age: Optional[str] = None,
+                   detailed: bool = False) -> List[Dict]:
         """
-        if (collection_id not in COLLECTIONS and
-                not collection_id.startswith('promotion')):
-            raise ValueError('Invalid collection_id \'{collection}\'.'.format(
-                collection=collection_id))
-        collection_name = COLLECTIONS.get(collection_id) or collection_id
-
+        Busca lista de aplicações de uma coleção específica.
+
+        Args:
+            collection_id: ID da coleção (ex: 'NEW_FREE')
+            category_id: ID da categoria (opcional, ex: 'GAME_ACTION')
+            results: Número de apps para recuperar
+            page: Número da página para recuperar
+            age: Filtro de faixa etária (apenas para categorias FAMILY)
+            detailed: Se True, busca detalhes completos de cada app
+
+        Returns:
+            Lista de dicionários com dados dos apps
+
+        Raises:
+            PlayScraperError: Para parâmetros inválidos
+        """
+        self._validate_collection_params(collection_id, category_id, results, page)
+
+        collection_name = COLLECTIONS.get(collection_id, collection_id)
         category = '' if category_id is None else CATEGORIES.get(category_id)
-        if category is None:
-            raise ValueError('Invalid category_id \'{category}\'.'.format(
-                category=category_id))
-
-        results = s.NUM_RESULTS if results is None else results
-        if results > 120:
-            raise ValueError('Number of results cannot be more than 120.')
+
+        results = results or s.NUM_RESULTS
+        page = page or 0
 
-        page = 0 if page is None else page
-        if page * results > 500:
-            raise ValueError('Start (page * results) cannot be greater than 500.')
-
-        if category.startswith('FAMILY') and age is not None:
+        if category and category.startswith('FAMILY') and age is not None:
             self.params['age'] = AGE_RANGE[age]
 
         url = build_collection_url(category, collection_name)
         data = generate_post_data(results, page)
         response = send_request('POST', url, data, self.params)
 
-        if detailed:
-            apps = self._parse_multiple_apps(response)
-        else:
-            soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf8')
-            apps = [parse_card_info(app_card)
-                    for app_card in soup.select('div[data-uitype="500"]')]
-
-        return apps
-
-    def developer(self, developer, results=None, page=None, detailed=False):
-        """Sends a POST request and retrieves a list of the developer's
-        published applications on the Play Store.
-
-        :param developer: developer name to retrieve apps from, e.g. 'Disney'
-        :param results: the number of app results to retrieve
-        :param page: the page number to retrieve
-        :param detailed: if True, sends request per app for its full detail
-        :return: a list of app dictionaries
+        return self._process_app_response(response, detailed)
+
+    def _validate_collection_params(self,
+                                    collection_id: str,
+                                    category_id: Optional[str],
+                                    results: Optional[int],
+                                    page: Optional[int]) -> None:
+        """Valida parâmetros da coleção"""
+        if (collection_id not in COLLECTIONS and
+                not collection_id.startswith('promotion')):
+            raise PlayScraperError(f'collection_id inválido: {collection_id}')
+
+        if category_id is not None and CATEGORIES.get(category_id) is None:
+            raise PlayScraperError(f'category_id inválido: {category_id}')
+
+        if results is not None and results > self.MAX_RESULTS:
+            raise PlayScraperError(
+                f'Número de resultados não pode ser maior que {self.MAX_RESULTS}'
+            )
+
+        if page is not None and results is not None:
+            if page * results > self.MAX_PAGE_LIMIT:
+                raise PlayScraperError(
+                    f'Início (page * results) não pode ser maior que {self.MAX_PAGE_LIMIT}'
+                )
+
+    def developer(self,
+                  developer: str,
+                  results: Optional[int] = None,
+                  page: Optional[int] = None,
+                  detailed: bool = False) -> List[Dict]:
         """
-        if not isinstance(developer, basestring) or developer.isdigit():
-            raise ValueError('Parameter \'developer\' must be the developer name, not the developer id.')
-
-        results = s.DEV_RESULTS if results is None else results
-        page = 0 if page is None else page
+        Busca aplicações publicadas por um desenvolvedor específico.
+
+        Args:
+            developer: Nome do desenvolvedor (ex: 'Disney')
+            results: Número de resultados para recuperar
+            page: Número da página para recuperar
+            detailed: Se True, busca detalhes completos de cada app
+
+        Returns:
+            Lista de dicionários com dados dos apps
+
+        Raises:
+            PlayScraperError: Para parâmetros inválidos
+        """
+        self._validate_developer_params(developer, results, page)
+
+        results = results or s.DEV_RESULTS
+        page = page or 0
         page_num = (results // 20) * page
-        if not 0 <= page_num <= 12:
-            raise ValueError('Page out of range. (results // 20) * page must be between 0 - 12')
         pagtok = self._pagtok[page_num]
 
         url = build_url('developer', developer)
         data = generate_post_data(results, 0, pagtok)
         response = send_request('POST', url, data, self.params)
 
-        if detailed:
-            apps = self._parse_multiple_apps(response)
-        else:
-            soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf8')
-            apps = [parse_card_info(app)
-                    for app in soup.select('div[data-uitype="500"]')]
-
-        return apps
-
-    def suggestions(self, query):
-        """Sends a GET request and retrieves a list of autocomplete suggestions
-        matching the query term(s).
-
-        :param query: search query term(s) to retrieve autocomplete suggestions
-        :return: a list of suggested search queries, up to 5
+        return self._process_app_response(response, detailed)
+
+    def _validate_developer_params(self,
+                                   developer: str,
+                                   results: Optional[int],
+                                   page: Optional[int]) -> None:
+        """Valida parâmetros do desenvolvedor"""
+        if not isinstance(developer, str) or developer.isdigit():
+            raise PlayScraperError(
+                'Parâmetro "developer" deve ser o nome do desenvolvedor, '
+                'não o ID do desenvolvedor'
+            )
+
+        if results is not None and page is not None:
+            page_num = (results // 20) * page
+            if not 0 <= page_num <= self.MAX_DEV_PAGE_NUM:
+                raise PlayScraperError(
+                    f'Página fora de alcance. (results // 20) * page '
+                    f'deve estar entre 0 e {self.MAX_DEV_PAGE_NUM}'
+                )
+
+    def suggestions(self, query: str) -> List[str]:
         """
-        if not query:
-            raise ValueError("Cannot get suggestions for an empty query.")
+        Busca sugestões de autocompletar para um termo de pesquisa.
+
+        Args:
+            query: Termo(s) de pesquisa para buscar sugestões
+
+        Returns:
+            Lista de sugestões de pesquisa (até 5)
+
+        Raises:
+            PlayScraperError: Se a query estiver vazia
+        """
+        if not query or not query.strip():
+            raise PlayScraperError("Não é possível buscar sugestões para uma query vazia")
 
-        self.params.update({
+        params = self.params.copy()
+        params.update({
             'json': 1,
             'c': 0,
             'query': query,
         })
 
-        response = send_request('GET',
-                                self._suggestion_url,
-                                params=self.params)
-        suggestions = [q['s'] for q in response.json()]
-
-        return suggestions
-
-    def search(self, query, page=None, detailed=False):
-        """Sends a POST request and retrieves a list of applications matching
-        the query term(s).
+        response = send_request('GET', self._suggestion_url, params=params)
+        return [q['s'] for q in response.json()]
 
-        :param query: search query term(s) to retrieve matching apps
-        :param page: the page number to retrieve. Max is 12.
-        :param detailed: if True, sends request per app for its full detail
-        :return: a list of apps matching search terms
+    def search(self,
+               query: str,
+               page: Optional[int] = None,
+               detailed: bool = False) -> List[Dict]:
+        """
+        Busca aplicações que correspondem aos termos de pesquisa.
+
+        Args:
+            query: Termo(s) de pesquisa
+            page: Número da página para recuperar (máximo 12)
+            detailed: Se True, busca detalhes completos de cada app
+
+        Returns:
+            Lista de apps que correspondem aos termos de pesquisa
+
+        Raises:
+            PlayScraperError: Se a página estiver fora do alcance
         """
-        page = 0 if page is None else int(page)
+        page = page or 0
+
         if page > len(self._pagtok) - 1:
-            raise ValueError('Parameter \'page\' ({page}) must be between 0 and 12.'.format(
-                page=page))
+            raise PlayScraperError(
+                f'Parâmetro "page" ({page}) deve estar entre 0 e {self.MAX_SEARCH_PAGE}'
+            )
 
         pagtok = self._pagtok[page]
         data = generate_post_data(0, 0, pagtok)
 
-        self.params.update({
+        params = self.params.copy()
+        params.update({
             'q': quote_plus(query),
             'c': 'apps',
         })
 
-        response = send_request('POST', self._search_url, data, self.params)
-        soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf8')
+        response = send_request('POST', self._search_url, data, params)
+        return self._process_app_response(response, detailed)
 
-        if detailed:
-            apps = self._parse_multiple_apps(response)
-        else:
-            apps = [parse_card_info(app)
-                    for app in soup.select('div[data-uitype="500"]')]
-
-        return apps
-
-    def similar(self, app_id, detailed=False, **kwargs):
-        """Sends a GET request, follows the redirect, and retrieves a list of
-        applications similar to the specified app.
-
-        :param app_id: app to retrieve details from, e.g. 'com.nintendo.zaaa'
-        :param detailed: if True, sends request per app for its full detail
-        :return: a list of similar apps
+    def similar(self,
+                app_id: str,
+                detailed: bool = False) -> List[Dict]:
+        """
+        Busca aplicações similares ao app especificado.
+
+        Args:
+            app_id: ID do app para buscar similares (ex: 'com.nintendo.zaaa')
+            detailed: Se True, busca detalhes completos de cada app
+
+        Returns:
+            Lista de apps similares
         """
         url = build_url('similar', app_id)
-        response = send_request('GET',
-                                url,
-                                params=self.params,
-                                allow_redirects=True)
-        soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf8')
+        response = send_request('GET', url, params=self.params, allow_redirects=True)
+        return self._process_app_response(response, detailed)
 
+    def _process_app_response(self,
+                              response: requests.Response,
+                              detailed: bool) -> List[Dict]:
+        """
+        Processa resposta HTTP e retorna lista de apps.
+
+        Args:
+            response: Resposta HTTP
+            detailed: Se deve buscar detalhes completos
+
+        Returns:
+            Lista de dicionários com dados dos apps
+        """
         if detailed:
-            apps = self._parse_multiple_apps(response)
+            return self._parse_multiple_apps(response)
         else:
-            apps = [parse_card_info(app)
-                    for app in soup.select('div[data-uitype="500"]')]
+            soup = self._make_soup(response)
+            return self._parse_app_cards(soup)
 
-        return apps
-
-    def categories(self, ignore_promotions=True):
-        """Sends a GET request to the front page (app store base url), parses
-        and returns a list of all available categories.
+    def categories(self, ignore_promotions: bool = True) -> Dict[str, Dict]:
+        """
+        Busca e retorna todas as categorias disponíveis.
+
+        Args:
+            ignore_promotions: Se deve ignorar promoções
+
+        Returns:
+            Dicionário com informações das categorias
         """
         categories = {}
         strainer = SoupStrainer('ul', {'class': 'submenu-item-wrapper'})
 
         response = send_request('GET', s.BASE_URL, params=self.params)
-        soup = BeautifulSoup(response.content,
-                             'lxml',
-                             from_encoding='utf8',
-                             parse_only=strainer)
-        category_links = soup.select('a.child-submenu-link')
-        category_links += soup.select('a.parent-submenu-link')
-        age_query = '?age='
-
+        soup = BeautifulSoup(
+            response.content,
+            'lxml',
+            from_encoding='utf8',
+            parse_only=strainer
+        )
+
+        category_links = (
+            soup.select('a.child-submenu-link')
+            + soup.select('a.parent-submenu-link')
+        )
+
         for cat in category_links:
-            url = urljoin(s.BASE_URL, cat.attrs['href'])
-            category_id = url.split('/')[-1]
-            name = cat.string.strip()
+            category_data = self._extract_category_data(cat, ignore_promotions)
+            if category_data:
+                category_id = category_data['category_id']
+                if category_id not in categories:
+                    categories[category_id] = category_data
 
-            if age_query in category_id:
-                category_id = 'FAMILY'
-                url = url.split('?')[0]
-                name = 'Family'
+        return categories
 
-            if category_id not in categories:
-                if ignore_promotions and '/store/apps/category/' not in url:
-                    continue
+    def _extract_category_data(self,
+                               cat_link,
+                               ignore_promotions: bool) -> Optional[Dict]:
+        """
+        Extrai dados de uma categoria a partir de um link.
+
+        Args:
+            cat_link: Elemento de link da categoria
+            ignore_promotions: Se deve ignorar promoções
+
+        Returns:
+            Dicionário com dados da categoria ou None se deve ser ignorada
+        """
+        url = urljoin(s.BASE_URL, cat_link.attrs['href'])
+        category_id = url.split('/')[-1]
+        name = cat_link.string.strip()
+        age_query = '?age='
 
-            categories[category_id] = {
-                'name': name,
-                'url': url,
-                'category_id': category_id}
+        if age_query in category_id:
+            category_id = 'FAMILY'
+            url = url.split('?')[0]
+            name = 'Family'
 
-        return categories
+        if ignore_promotions and '/store/apps/category/' not in url:
+            return None
+
+        return {
+            'name': name,
+            'url': url,
+            'category_id': category_id
+        }