Source code for scholarly.author_parser

from .publication_parser import PublicationParser
import re
from .data_types import Author, AuthorSource, PublicationSource
from selenium.common.exceptions import WebDriverException

_CITATIONAUTHRE = r'user=([\w-]*)'
_HOST = 'https://scholar.google.com{0}'
_PAGESIZE = 100
_EMAILAUTHORRE = r'Verified email at '
_CITATIONAUTH = '/citations?hl=en&user={0}'
_COAUTH = ('https://scholar.google.com/citations?user={0}&hl=en'
           '#d=gsc_md_cod&u=%2Fcitations%3Fview_op%3Dlist_colleagues'
           '%26hl%3Den%26json%3D%26user%3D{0}%23t%3Dgsc_cod_lc')


[docs]class AuthorParser: """Returns an object for a single author""" def __init__(self, nav): self.nav = nav self._sections = {'basics', 'indices', 'counts', 'coauthors', 'publications'}
[docs] def get_author(self, __data)->Author: """ Fills the information for an author container """ author: Author = {'container_type': 'Author'} author['filled'] = [] if isinstance(__data, str): author['scholar_id'] = __data author['source'] = AuthorSource.AUTHOR_PROFILE_PAGE else: author['source'] = AuthorSource.SEARCH_AUTHOR_SNIPPETS author['scholar_id'] = re.findall(_CITATIONAUTHRE, __data('a')[0]['href'])[0] pic = '/citations?view_op=medium_photo&user={}'.format(author['scholar_id']) author['url_picture'] = _HOST.format(pic) name_class = self._find_tag_class_name(__data, 'h3', 'name') author['name'] = __data.find('h3', class_=name_class).text aff_class = self._find_tag_class_name(__data, 'div', 'aff') affiliation = __data.find('div', class_=aff_class) if affiliation: author['affiliation'] = affiliation.text email_class = self._find_tag_class_name(__data, 'div', 'eml') email = __data.find('div', class_=email_class) if email: author['email_domain'] = re.sub(_EMAILAUTHORRE, r'@', email.text) int_class = self._find_tag_class_name(__data, 'a', 'one_int') interests = __data.find_all('a', class_=int_class) author['interests'] = [i.text.strip() for i in interests] citedby_class = self._find_tag_class_name(__data, 'div', 'cby') citedby = __data.find('div', class_=citedby_class) if citedby and citedby.text != '': author['citedby'] = int(citedby.text[9:]) return author
def _find_tag_class_name(self, __data, tag, text): elements = __data.find_all(tag) for element in elements: if 'class' in element.attrs and text in element.attrs['class'][0]: return element.attrs['class'][0] def _fill_basics(self, soup, author): author['name'] = soup.find('div', id='gsc_prf_in').text if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE: res = soup.find('img', id='gsc_prf_pup-img') if res != None: if "avatar_scholar" not in res['src']: author['url_picture'] = res['src'] author['affiliation'] = soup.find('div', class_='gsc_prf_il').text author['interests'] = [i.text.strip() for i in soup.find_all('a', class_='gsc_prf_inta')] if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE: email = soup.find('div', id="gsc_prf_ivh", class_="gsc_prf_il") if email.text != "No verified email": author['email_domain'] = '@'+email.text.split(" ")[3] if author['source'] == AuthorSource.CO_AUTHORS_LIST: picture = soup.find('img', id="gsc_prf_pup-img").get('src') if "avatar_scholar" in picture: picture = _HOST.format(picture) author['url_picture'] = picture index = soup.find_all('td', class_='gsc_rsb_std') if index: author['citedby'] = int(index[0].text) def _fill_indices(self, soup, author): index = soup.find_all('td', class_='gsc_rsb_std') if index: author['citedby'] = int(index[0].text) author['citedby5y'] = int(index[1].text) author['hindex'] = int(index[2].text) author['hindex5y'] = int(index[3].text) author['i10index'] = int(index[4].text) author['i10index5y'] = int(index[5].text) else: author['hindex'] = 0 author['hindex5y'] = 0 author['i10index'] = 0 author['i10index5y'] = 0 def _fill_counts(self, soup, author): years = [int(y.text) for y in soup.find_all('span', class_='gsc_g_t')] cites = [int(c.text) for c in soup.find_all('span', class_='gsc_g_al')] author['cites_per_year'] = dict(zip(years, cites)) def _fill_publications(self, soup, author, publication_limit: int = 0, sortby_str: str = ''): author['publications'] = list() pubstart = 0 url_citations = _CITATIONAUTH.format(author['scholar_id']) url_citations += sortby_str pub_parser = PublicationParser(self.nav) flag = False while True: for row in soup.find_all('tr', class_='gsc_a_tr'): new_pub = pub_parser.get_publication(row, PublicationSource.AUTHOR_PUBLICATION_ENTRY) author['publications'].append(new_pub) if (publication_limit) and (len(author['publications']) >= publication_limit): flag = True break if 'disabled' not in soup.find('button', id='gsc_bpf_more').attrs and not flag: pubstart += _PAGESIZE url = '{0}&cstart={1}&pagesize={2}'.format( url_citations, pubstart, _PAGESIZE) soup = self.nav._get_soup(url) else: break def _get_coauthors_short(self, soup): """Get the short list of coauthors from the profile page. To be called by _fill_coauthors method. """ coauthors = soup.find_all('span', class_='gsc_rsb_a_desc') coauthor_ids = [re.findall(_CITATIONAUTHRE, coauth('a')[0].get('href'))[0] for coauth in coauthors] coauthor_names = [coauth.find(tabindex="-1").text for coauth in coauthors] coauthor_affils = [coauth.find(class_="gsc_rsb_a_ext").text for coauth in coauthors] return coauthor_ids, coauthor_names, coauthor_affils def _get_coauthors_long(self, author): """Get the long (>20) list of coauthors. Opens the dialog box to get the complete list of coauthors. To be called by _fill_coauthors method. """ wd = self.nav.pm._get_webdriver() try: wd.get(_COAUTH.format(author['scholar_id'])) # Wait up to 30 seconds for the various elements to be available. # The wait may be better set elsewhere. wd.implicitly_wait(30) coauthors = wd.find_elements_by_class_name('gs_ai_pho') coauthor_ids = [re.findall(_CITATIONAUTHRE, coauth.get_attribute('href'))[0] for coauth in coauthors] coauthor_names = [name.text for name in wd.find_elements_by_class_name('gs_ai_name')] coauthor_affils = [affil.text for affil in wd.find_elements_by_class_name('gs_ai_aff')] return coauthor_ids, coauthor_names, coauthor_affils finally: wd.quit() def _fill_coauthors(self, soup, author): # If "View All" is not found, scrape the page for coauthors if not soup.find_all('button', id='gsc_coauth_opn'): coauthor_info = self._get_coauthors_short(soup) else: # If "View All" is found, try opening the dialog box. # If geckodriver is not installed, resort to a short list and warn. try: coauthor_info = self._get_coauthors_long(author) except WebDriverException as err: coauthor_info = self._get_coauthors_short(soup) self.nav.logger.warning(err.msg) self.nav.logger.warning("Fetching only the top 20 coauthors") author['coauthors'] = [] for coauth_id, coauth_name, coauth_affil in zip(*coauthor_info): new_coauthor = self.get_author(coauth_id) new_coauthor['name'] = coauth_name new_coauthor['affiliation'] = coauth_affil new_coauthor['source'] = AuthorSource.CO_AUTHORS_LIST author['coauthors'].append(new_coauthor)
[docs] def fill(self, author, sections: list = [], sortby="citedby", publication_limit: int = 0): """Populate the Author with information from their profile The `sections` argument allows for finer granularity of the profile information to be pulled. :param sections: Sections of author profile to be filled, defaults to ``[]``. * ``basics``: fills name, affiliation, and interests; * ``citations``: fills h-index, i10-index, and 5-year analogues; * ``counts``: fills number of citations per year; * ``coauthors``: fills co-authors; * ``publications``: fills publications; * ``[]``: fills all of the above :type sections: ['basics','citations','counts','coauthors','publications',[]] list, optional :param sortby: Select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'. :type sortby: string :param publication_limit: Select the max number of publications you want you want to fill for the author. Defaults to no limit. :type publication_limit: int :returns: The filled object if fill was successfull, False otherwise. :rtype: Author or bool :Example:: .. testcode:: search_query = scholarly.search_author('Steven A Cholewiak') author = next(search_query) scholarly.pprint(author.fill(sections=['basic', 'citation_indices', 'co-authors'])) :Output:: .. testoutput:: {'affiliation': 'Vision Scientist', 'citedby': 304, 'citedby5y': 226, 'coauthors': [{'affiliation': 'Kurt Koffka Professor of Experimental ' 'Psychology, University of Giessen', 'filled': False, 'name': 'Roland Fleming', 'scholar_id': 'ruUKktgAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Professor of Vision Science, UC Berkeley', 'filled': False, 'name': 'Martin Banks', 'scholar_id': 'Smr99uEAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Durham University, Computer Science & Physics', 'filled': False, 'name': 'Gordon D. Love', 'scholar_id': '3xJXtlwAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Professor of ECE, Purdue University', 'filled': False, 'name': 'Hong Z Tan', 'scholar_id': 'OiVOAHMAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Deepmind', 'filled': False, 'name': 'Ari Weinstein', 'scholar_id': 'MnUboHYAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': "Brigham and Women's Hospital/Harvard Medical " 'School', 'filled': False, 'name': 'Chia-Chien Wu', 'scholar_id': 'dqokykoAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Professor of Psychology and Cognitive Science, ' 'Rutgers University', 'filled': False, 'name': 'Jacob Feldman', 'scholar_id': 'KoJrMIAAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Research Scientist at Google Research, PhD ' 'Student at UC Berkeley', 'filled': False, 'name': 'Pratul Srinivasan', 'scholar_id': 'aYyDsZ0AAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Formerly: Indiana University, Rutgers ' 'University, University of Pennsylvania', 'filled': False, 'name': 'Peter C. Pantelis', 'scholar_id': 'FoVvIK0AAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Professor in Computer Science, University of ' 'California, Berkeley', 'filled': False, 'name': 'Ren Ng', 'scholar_id': '6H0mhLUAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Yale University', 'filled': False, 'name': 'Steven W Zucker', 'scholar_id': 'rNTIQXYAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Brown University', 'filled': False, 'name': 'Ben Kunsberg', 'scholar_id': 'JPZWLKQAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Rutgers University, New Brunswick, NJ', 'filled': False, 'name': 'Manish Singh', 'scholar_id': '9XRvM88AAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Silicon Valley Professor of ECE, Purdue ' 'University', 'filled': False, 'name': 'David S. Ebert', 'scholar_id': 'fD3JviYAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Clinical Director, Neurolens Inc.,', 'filled': False, 'name': 'Vivek Labhishetty', 'scholar_id': 'tD7OGTQAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'MIT', 'filled': False, 'name': 'Joshua B. Tenenbaum', 'scholar_id': 'rRJ9wTJMUB8C', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Chief Scientist, isee AI', 'filled': False, 'name': 'Chris Baker', 'scholar_id': 'bTdT7hAAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Professor of Psychology, Ewha Womans ' 'University', 'filled': False, 'name': 'Sung-Ho Kim', 'scholar_id': 'KXQb7CAAAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Assistant Professor, Boston University', 'filled': False, 'name': 'Melissa M. Kibbe', 'scholar_id': 'NN4GKo8AAAAJ', 'source': 'CO_AUTHORS_LIST'}, {'affiliation': 'Nvidia Corporation', 'filled': False, 'name': 'Peter Shirley', 'scholar_id': 'nHx9IgYAAAAJ', 'source': 'CO_AUTHORS_LIST'}], 'email_domain': '@berkeley.edu', 'filled': False, 'hindex': 9, 'hindex5y': 9, 'i10index': 8, 'i10index5y': 7, 'interests': ['Depth Cues', '3D Shape', 'Shape from Texture & Shading', 'Naive Physics', 'Haptics'], 'name': 'Steven A. Cholewiak, PhD', 'scholar_id': '4bahYMkAAAAJ', 'source': 'SEARCH_AUTHOR_SNIPPETS', 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4bahYMkAAAAJ'} """ try: sections = [section.lower() for section in sections] sortby_str = '' if sortby == "year": sortby_str = '&view_op=list_works&sortby=pubdate' elif sortby != "citedby": raise Exception("Please enter a valid sortby parameter. Options: 'year', 'citedby'") url_citations = _CITATIONAUTH.format(author['scholar_id']) url_citations += sortby_str url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE) soup = self.nav._get_soup(url) if sections == []: for i in self._sections: if i not in author['filled']: (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str)) author['filled'].append(i) else: for i in sections: if i in self._sections and i not in author['filled']: (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str)) author['filled'].append(i) except Exception as e: raise(e) return author
def __repr__(self): return self.__str__()