Source code for scholarly.author_parser

from .publication_parser import PublicationParser
import re
from .data_types import Author, AuthorSource, PublicationSource, PublicAccess
import codecs

_CITATIONAUTHRE = r'user=([\w-]*)'
_HOST = 'https://scholar.google.com{0}'
_PAGESIZE = 100
_EMAILAUTHORRE = r'Verified email at '
_CITATIONAUTH = '/citations?hl=en&user={0}'
_COAUTH = '/citations?view_op=list_colleagues&hl=en&user={0}'
_MANDATES = "/citations?hl=en&tzom=300&user={0}&view_op=list_mandates&pagesize={1}"
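# Illustrative only (not part of the module): the relative-URL templates above
# are formatted with a scholar id (and, for _MANDATES, a page size) and either
# wrapped in _HOST or handed to the Navigator. For a hypothetical id 'ABC123':
#
#   _CITATIONAUTH.format('ABC123')
#       -> '/citations?hl=en&user=ABC123'
#   _HOST.format(_CITATIONAUTH.format('ABC123'))
#       -> 'https://scholar.google.com/citations?hl=en&user=ABC123'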


class AuthorParser:
    """Returns an object for a single author"""

    def __init__(self, nav):
        self.nav = nav
        self._sections = ['basics', 'indices', 'counts', 'coauthors',
                          'publications', 'public_access']
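    # Each name in ``self._sections`` corresponds to a ``_fill_<section>``
    # method defined below; ``fill`` dispatches to them via
    # ``getattr(self, f'_fill_{section}')``.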
    def get_author(self, __data) -> Author:
        """ Fills the information for an author container """
        author: Author = {'container_type': 'Author'}
        author['filled'] = []
        if isinstance(__data, str):
            author['scholar_id'] = __data
            author['source'] = AuthorSource.AUTHOR_PROFILE_PAGE
        else:
            author['source'] = AuthorSource.SEARCH_AUTHOR_SNIPPETS
            author['scholar_id'] = re.findall(_CITATIONAUTHRE, __data('a')[0]['href'])[0]
            pic = '/citations?view_op=medium_photo&user={}'.format(author['scholar_id'])
            author['url_picture'] = _HOST.format(pic)

            name_class = self._find_tag_class_name(__data, 'h3', 'name')
            author['name'] = __data.find('h3', class_=name_class).text

            aff_class = self._find_tag_class_name(__data, 'div', 'aff')
            affiliation = __data.find('div', class_=aff_class)
            if affiliation:
                author['affiliation'] = affiliation.text

            email_class = self._find_tag_class_name(__data, 'div', 'eml')
            email = __data.find('div', class_=email_class)
            if email:
                author['email_domain'] = re.sub(_EMAILAUTHORRE, r'@', email.text)

            int_class = self._find_tag_class_name(__data, 'a', 'one_int')
            if int_class:
                interests = __data.find_all('a', class_=int_class)
                author['interests'] = [i.text.strip() for i in interests]
            else:
                author['interests'] = []

            citedby_class = self._find_tag_class_name(__data, 'div', 'cby')
            citedby = __data.find('div', class_=citedby_class)
            if citedby and citedby.text != '':
                author['citedby'] = int(citedby.text[9:])

        return author
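    # Illustrative sketch (hypothetical id and Navigator instance, not part of
    # the class): ``get_author`` accepts either a bare scholar_id string or a
    # BeautifulSoup tag taken from a search-results snippet.
    #
    #   parser = AuthorParser(nav)
    #   stub = parser.get_author('ABC123')
    #   stub['scholar_id']   # 'ABC123'
    #   stub['source']       # AuthorSource.AUTHOR_PROFILE_PAGE
    #   stub['filled']       # []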
    def _find_tag_class_name(self, __data, tag, text):
        elements = __data.find_all(tag)
        for element in elements:
            if 'class' in element.attrs and text in element.attrs['class'][0]:
                return element.attrs['class'][0]

    def _fill_basics(self, soup, author):
        author['name'] = soup.find('div', id='gsc_prf_in').text
        if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE:
            res = soup.find('img', id='gsc_prf_pup-img')
            if res is not None:
                if "avatar_scholar" not in res['src']:
                    author['url_picture'] = res['src']
        elif author['source'] == AuthorSource.CO_AUTHORS_LIST:
            picture = soup.find('img', id="gsc_prf_pup-img").get('src')
            if "avatar_scholar" in picture:
                picture = _HOST.format(picture)
            author['url_picture'] = picture

        affiliation = soup.find('div', class_='gsc_prf_il')
        author['affiliation'] = affiliation.text
        affiliation_link = affiliation.find('a')
        if affiliation_link:
            author['organization'] = int(affiliation_link.get('href').split("org=")[-1])
        author['interests'] = [i.text.strip() for i in
                               soup.find_all('a', class_='gsc_prf_inta')]

        email = soup.find('div', id="gsc_prf_ivh", class_="gsc_prf_il")
        if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE:
            if email.text != "No verified email":
                author['email_domain'] = '@'+email.text.split(" ")[3]

        homepage = email.find('a', class_="gsc_prf_ila")
        if homepage:
            author['homepage'] = homepage.get('href')

        index = soup.find_all('td', class_='gsc_rsb_std')
        if index:
            author['citedby'] = int(index[0].text)

    def _fill_indices(self, soup, author):
        index = soup.find_all('td', class_='gsc_rsb_std')
        if index:
            author['citedby'] = int(index[0].text)
            author['citedby5y'] = int(index[1].text)
            author['hindex'] = int(index[2].text)
            author['hindex5y'] = int(index[3].text)
            author['i10index'] = int(index[4].text)
            author['i10index5y'] = int(index[5].text)
        else:
            author['hindex'] = 0
            author['hindex5y'] = 0
            author['i10index'] = 0
            author['i10index5y'] = 0

    def _fill_counts(self, soup, author):
        years = [int(y.text) for y in soup.find_all('span', class_='gsc_g_t')]
        cites = [0]*len(years)
        for c in soup.find_all('a', class_='gsc_g_a'):
            # The last field of the bar's style attribute gives its offset
            # counting back from the most recent year, hence the negative index.
            i = int(c['style'].split(':')[-1])
            cites[-i] = int(c.find('span', class_='gsc_g_al').text)
        author['cites_per_year'] = dict(zip(years, cites))

    def _fill_public_access(self, soup, author):
        available = soup.find('div', class_='gsc_rsb_m_a')
        not_available = soup.find('div', class_='gsc_rsb_m_na')
        n_available, n_not_available = 0, 0
        if available:
            n_available = int(re.sub("[.,]", "", available.text.split(" ")[0]))
        if not_available:
            n_not_available = int(re.sub("[.,]", "", not_available.text.split(" ")[0]))
        author["public_access"] = PublicAccess(available=n_available,
                                               not_available=n_not_available)

        if 'publications' not in author['filled']:
            return

        # Map author_pub_id to the publication entries filled earlier
        publications = {pub['author_pub_id']: pub for pub in author['publications']}

        soup = self.nav._get_soup(_MANDATES.format(author['scholar_id'], _PAGESIZE))
        while True:
            rows = soup.find_all('div', 'gsc_mnd_sec_na')
            if rows:
                for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gs_nph gsc_mnd_link_font'):
                    author_pub_id = re.findall(r"citation_for_view=([\w:-]*)", row['data-href'])[0]
                    publications[author_pub_id]["public_access"] = False

            rows = soup.find_all('div', 'gsc_mnd_sec_avl')
            if rows:
                for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gs_nph gsc_mnd_link_font'):
                    author_pub_id = re.findall(r"citation_for_view=([\w:-]*)", row['data-href'])[0]
                    publications[author_pub_id]["public_access"] = True

            next_button = soup.find(class_="gs_btnPR")
            if next_button and "disabled" not in next_button.attrs:
                # The navigation button's onclick holds an escaped URL; strip
                # the JavaScript wrapper and unescape it before fetching.
                url = next_button['onclick'][17:-1]
                url = codecs.getdecoder("unicode_escape")(url)[0]
                soup = self.nav._get_soup(url)
            else:
                break

    def _fill_publications(self, soup, author, publication_limit: int = 0, sortby_str: str = ''):
        author['publications'] = list()
        pubstart = 0
        url_citations = _CITATIONAUTH.format(author['scholar_id'])
        url_citations += sortby_str
        pub_parser = PublicationParser(self.nav)
        flag = False
        while True:
            for row in soup.find_all('tr', class_='gsc_a_tr'):
                new_pub = pub_parser.get_publication(row, PublicationSource.AUTHOR_PUBLICATION_ENTRY)
                author['publications'].append(new_pub)
                if (publication_limit) and (len(author['publications']) >= publication_limit):
                    flag = True
                    break
            if 'disabled' not in soup.find('button', id='gsc_bpf_more').attrs and not flag:
                pubstart += _PAGESIZE
                url = '{0}&cstart={1}&pagesize={2}'.format(
                    url_citations, pubstart, _PAGESIZE)
                soup = self.nav._get_soup(url)
            else:
                break

    def _get_coauthors_short(self, soup):
        """Get the short list of coauthors from the profile page.

        This method fetches the list of coauthors visible from an author's
        profile page alone. This may or may not be the complete list of
        coauthors.

        Note:
        -----
        This method is to be called by the _fill_coauthors method.
        """
        coauthors = soup.find_all('span', class_='gsc_rsb_a_desc')
        coauthor_ids = [re.findall(_CITATIONAUTHRE, coauth('a')[0].get('href'))[0]
                        for coauth in coauthors]
        coauthor_names = [coauth.find(tabindex="-1").text for coauth in coauthors]
        coauthor_affils = [coauth.find(class_="gsc_rsb_a_ext").text for coauth in coauthors]
        return coauthor_ids, coauthor_names, coauthor_affils

    def _get_coauthors_long(self, author):
        """Get the long (>20) list of coauthors.

        This method fetches the complete list of coauthors by opening a new
        page filled with the complete coauthor list.

        Note:
        -----
        This method is to be called by the _fill_coauthors method.
        """
        soup = self.nav._get_soup(_COAUTH.format(author['scholar_id']))
        coauthors = soup.find_all('div', 'gs_ai gs_scl')
        coauthor_ids = [re.findall(_CITATIONAUTHRE, coauth('a')[0].get('href'))[0]
                        for coauth in coauthors]
        coauthor_names = [coauth.find(class_="gs_ai_name").text for coauth in coauthors]
        coauthor_affils = [coauth.find(class_="gs_ai_aff").text for coauth in coauthors]
        return coauthor_ids, coauthor_names, coauthor_affils

    def _fill_coauthors(self, soup, author):
        # If "View All" is not found, scrape the page for coauthors
        if not soup.find_all('button', id='gsc_coauth_opn'):
            coauthor_info = self._get_coauthors_short(soup)
        else:
            # If "View All" is found, try opening the dialog box.
            # If geckodriver is not installed, resort to a short list and warn.
            try:
                coauthor_info = self._get_coauthors_long(author)
            except Exception as err:
                coauthor_info = self._get_coauthors_short(soup)
                self.nav.logger.warning(err)
                self.nav.logger.warning("Fetching only the top 20 coauthors")

        author['coauthors'] = []
        for coauth_id, coauth_name, coauth_affil in zip(*coauthor_info):
            new_coauthor = self.get_author(coauth_id)
            new_coauthor['name'] = coauth_name
            new_coauthor['affiliation'] = coauth_affil
            new_coauthor['source'] = AuthorSource.CO_AUTHORS_LIST
            author['coauthors'].append(new_coauthor)
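    # Illustrative sketch of what the helpers above write into the Author dict
    # (values hypothetical):
    #
    #   _fill_indices       -> author['citedby'], author['citedby5y'],
    #                          author['hindex'], author['i10index'], ...
    #   _fill_counts        -> author['cites_per_year'] == {2019: 12, 2020: 18, 2021: 25}
    #   _fill_public_access -> author['public_access'] with 'available' /
    #                          'not_available' counts, plus a per-publication
    #                          'public_access' flag when publications are filled
    #   _fill_coauthors     -> author['coauthors'] == list of unfilled Author stubs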
    def fill(self, author, sections: list = [], sortby="citedby", publication_limit: int = 0):
        """Populate the Author with information from their profile

        The `sections` argument allows for finer granularity of the profile
        information to be pulled.

        :param sections: Sections of author profile to be filled, defaults to ``[]``.

            * ``basics``: fills name, affiliation, and interests;
            * ``indices``: fills h-index, i10-index, and 5-year analogues;
            * ``counts``: fills number of citations per year;
            * ``public_access``: fills number of articles with public access mandates;
            * ``coauthors``: fills co-authors;
            * ``publications``: fills publications;
            * ``[]``: fills all of the above
        :type sections: ['basics','indices','counts','public_access','coauthors','publications',[]] list, optional
        :param sortby: Select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
        :type sortby: string
        :param publication_limit: Select the max number of publications you want to fill for the author. Defaults to no limit.
        :type publication_limit: int
        :returns: The filled object if fill was successful, False otherwise.
        :rtype: Author or bool

        :Example::

        .. testcode::

            search_query = scholarly.search_author('Steven A Cholewiak')
            author = next(search_query)
            author = scholarly.fill(author, sections=['basics', 'indices', 'coauthors'])
            scholarly.pprint(author)

        :Output::

        .. testoutput::

            {'affiliation': 'Vision Scientist',
             'citedby': 304,
             'citedby5y': 226,
             'coauthors': [{'affiliation': 'Kurt Koffka Professor of Experimental '
                                           'Psychology, University of Giessen',
                            'filled': False,
                            'name': 'Roland Fleming',
                            'scholar_id': 'ruUKktgAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of Vision Science, UC Berkeley',
                            'filled': False,
                            'name': 'Martin Banks',
                            'scholar_id': 'Smr99uEAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Durham University, Computer Science & Physics',
                            'filled': False,
                            'name': 'Gordon D. Love',
                            'scholar_id': '3xJXtlwAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of ECE, Purdue University',
                            'filled': False,
                            'name': 'Hong Z Tan',
                            'scholar_id': 'OiVOAHMAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Deepmind',
                            'filled': False,
                            'name': 'Ari Weinstein',
                            'scholar_id': 'MnUboHYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': "Brigham and Women's Hospital/Harvard Medical "
                                           'School',
                            'filled': False,
                            'name': 'Chia-Chien Wu',
                            'scholar_id': 'dqokykoAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of Psychology and Cognitive Science, '
                                           'Rutgers University',
                            'filled': False,
                            'name': 'Jacob Feldman',
                            'scholar_id': 'KoJrMIAAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Research Scientist at Google Research, PhD '
                                           'Student at UC Berkeley',
                            'filled': False,
                            'name': 'Pratul Srinivasan',
                            'scholar_id': 'aYyDsZ0AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Formerly: Indiana University, Rutgers '
                                           'University, University of Pennsylvania',
                            'filled': False,
                            'name': 'Peter C. Pantelis',
                            'scholar_id': 'FoVvIK0AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor in Computer Science, University of '
                                           'California, Berkeley',
                            'filled': False,
                            'name': 'Ren Ng',
                            'scholar_id': '6H0mhLUAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Yale University',
                            'filled': False,
                            'name': 'Steven W Zucker',
                            'scholar_id': 'rNTIQXYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Brown University',
                            'filled': False,
                            'name': 'Ben Kunsberg',
                            'scholar_id': 'JPZWLKQAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Rutgers University, New Brunswick, NJ',
                            'filled': False,
                            'name': 'Manish Singh',
                            'scholar_id': '9XRvM88AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Silicon Valley Professor of ECE, Purdue '
                                           'University',
                            'filled': False,
                            'name': 'David S. Ebert',
                            'scholar_id': 'fD3JviYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Clinical Director, Neurolens Inc.,',
                            'filled': False,
                            'name': 'Vivek Labhishetty',
                            'scholar_id': 'tD7OGTQAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'MIT',
                            'filled': False,
                            'name': 'Joshua B. Tenenbaum',
                            'scholar_id': 'rRJ9wTJMUB8C',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Chief Scientist, isee AI',
                            'filled': False,
                            'name': 'Chris Baker',
                            'scholar_id': 'bTdT7hAAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of Psychology, Ewha Womans '
                                           'University',
                            'filled': False,
                            'name': 'Sung-Ho Kim',
                            'scholar_id': 'KXQb7CAAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Assistant Professor, Boston University',
                            'filled': False,
                            'name': 'Melissa M. Kibbe',
                            'scholar_id': 'NN4GKo8AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Nvidia Corporation',
                            'filled': False,
                            'name': 'Peter Shirley',
                            'scholar_id': 'nHx9IgYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'}],
             'email_domain': '@berkeley.edu',
             'homepage': 'http://steven.cholewiak.com/',
             'filled': False,
             'hindex': 9,
             'hindex5y': 9,
             'i10index': 8,
             'i10index5y': 7,
             'interests': ['Depth Cues',
                           '3D Shape',
                           'Shape from Texture & Shading',
                           'Naive Physics',
                           'Haptics'],
             'name': 'Steven A. Cholewiak, PhD',
             'scholar_id': '4bahYMkAAAAJ',
             'source': 'SEARCH_AUTHOR_SNIPPETS',
             'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4bahYMkAAAAJ'}
        """
        try:
            sections = [section.lower() for section in sections]
            sections.sort(reverse=True)  # Ensure 'publications' comes before 'public_access'
            sortby_str = ''
            if sortby == "year":
                sortby_str = '&view_op=list_works&sortby=pubdate'
            elif sortby != "citedby":
                raise Exception("Please enter a valid sortby parameter. Options: 'year', 'citedby'")
            url_citations = _CITATIONAUTH.format(author['scholar_id'])
            url_citations += sortby_str
            url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE)
            soup = self.nav._get_soup(url)

            # Update scholar_id
            scholar_id = re.findall(_CITATIONAUTHRE, soup.find("link", rel="canonical").get('href', ""))[0]
            if scholar_id != author['scholar_id']:
                self.nav.logger.warning("Changing the scholar_id following redirect from %s to %s. "
                                        "To avoid this warning, use %s to look up this scholar.",
                                        author['scholar_id'], scholar_id, scholar_id)
                author["scholar_id"] = scholar_id

            if sections == []:
                for i in self._sections:
                    if i not in author['filled']:
                        (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications'
                         else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str))
                        author['filled'].append(i)
            else:
                for i in sections:
                    if i in self._sections and i not in author['filled']:
                        (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications'
                         else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str))
                        author['filled'].append(i)
        except Exception as e:
            raise(e)

        return author
    def __repr__(self):
        return self.__str__()
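# Usage sketch (illustrative only, mirroring the doctest in ``fill`` above):
# AuthorParser is normally driven through the public ``scholarly`` module
# rather than instantiated directly.
#
#   from scholarly import scholarly
#
#   search_query = scholarly.search_author('Steven A Cholewiak')
#   author = next(search_query)
#   author = scholarly.fill(author, sections=['basics', 'indices', 'coauthors'],
#                           sortby='citedby', publication_limit=10)
#   scholarly.pprint(author)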