# Source code for scholarly.author_parser

import re

from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

from .data_types import Author, AuthorSource, PublicationSource
from .publication_parser import PublicationParser

# Regex capturing the value of the ``user=`` query parameter in a profile
# link; the captured group is stored as the author's ``scholar_id``.
_CITATIONAUTHRE = r'user=([\w-]*)'
# Template that prefixes a site-relative path with the Scholar host.
_HOST = 'https://scholar.google.com{0}'
# Value sent as ``pagesize`` and used as the ``cstart`` step when paging
# through an author's publications.
_PAGESIZE = 100
# Text replaced by '@' when deriving ``email_domain`` from a search snippet.
_EMAILAUTHORRE = r'Verified email at '
# Site-relative path of an author's profile page; ``{0}`` is the scholar id.
_CITATIONAUTH = '/citations?hl=en&user={0}'
# Profile URL whose fragment references ``view_op=list_colleagues``; it is
# loaded through the webdriver to read the full co-author list.
# ``{0}`` is the scholar id (used twice).
_COAUTH = ('https://scholar.google.com/citations?user={0}&hl=en'
           '#d=gsc_md_cod&u=%2Fcitations%3Fview_op%3Dlist_colleagues'
           '%26hl%3Den%26json%3D%26user%3D{0}%23t%3Dgsc_cod_lc')


class AuthorParser:
    """Returns an object for a single author"""

    def __init__(self, nav):
        self.nav = nav
        self._sections = {'basics',
                          'indices',
                          'counts',
                          'coauthors',
                          'publications'}

[docs]    def get_author(self, __data)->Author:
        """ Fills the information for an author container
        """
        author: Author = {'container_type': 'Author'}
        author['filled'] = []
        if isinstance(__data, str):
            author['scholar_id'] = __data
            author['source'] = AuthorSource.AUTHOR_PROFILE_PAGE
        else:
            author['source'] = AuthorSource.SEARCH_AUTHOR_SNIPPETS
            author['scholar_id'] = re.findall(_CITATIONAUTHRE, __data('a')[0]['href'])[0]

            pic = '/citations?view_op=medium_photo&user={}'.format(author['scholar_id'])
            author['url_picture'] = _HOST.format(pic)

            name_class = self._find_tag_class_name(__data, 'h3', 'name')
            author['name'] = __data.find('h3', class_=name_class).text

            aff_class = self._find_tag_class_name(__data, 'div', 'aff')
            affiliation = __data.find('div', class_=aff_class)
            if affiliation:
                author['affiliation'] = affiliation.text

            email_class = self._find_tag_class_name(__data, 'div', 'eml')
            email = __data.find('div', class_=email_class)
            if email:
                author['email_domain'] = re.sub(_EMAILAUTHORRE, r'@', email.text)

            int_class = self._find_tag_class_name(__data, 'a', 'one_int')
            interests = __data.find_all('a', class_=int_class)
            author['interests'] = [i.text.strip() for i in interests]

            citedby_class = self._find_tag_class_name(__data, 'div', 'cby')
            citedby = __data.find('div', class_=citedby_class)
            if citedby and citedby.text != '':
                author['citedby'] = int(citedby.text[9:])

        return author


    def _find_tag_class_name(self, __data, tag, text):
        elements = __data.find_all(tag)
        for element in elements:
            if 'class' in element.attrs and text in element.attrs['class'][0]:
                return element.attrs['class'][0]

    def _fill_basics(self, soup, author):
        """Fill name, picture, affiliation, interests, email domain and total citations.

        :param soup: parsed profile page offering ``find`` / ``find_all``.
        :param author: author container, updated in place; its ``source``
            decides which picture/email rules apply.

        Optional page elements (picture, verified-email line, citation
        table) are skipped when absent instead of raising. The name and
        affiliation elements are still required.
        """
        author['name'] = soup.find('div', id='gsc_prf_in').text
        if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE:
            res = soup.find('img', id='gsc_prf_pup-img')
            # A 'src' containing "avatar_scholar" is deliberately not
            # recorded for profile-page authors.
            if res is not None and "avatar_scholar" not in res['src']:
                author['url_picture'] = res['src']
        author['affiliation'] = soup.find('div', class_='gsc_prf_il').text
        author['interests'] = [i.text.strip() for i in
                               soup.find_all('a', class_='gsc_prf_inta')]
        if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE:
            email = soup.find('div', id="gsc_prf_ivh", class_="gsc_prf_il")
            # ``find`` yields None when the element is missing.
            if email is not None and email.text != "No verified email":
                words = email.text.split(" ")
                # The domain is the fourth space-separated word of the line.
                if len(words) > 3:
                    author['email_domain'] = '@' + words[3]
        if author['source'] == AuthorSource.CO_AUTHORS_LIST:
            img = soup.find('img', id="gsc_prf_pup-img")
            picture = img.get('src') if img is not None else None
            if picture is not None:
                # Such a 'src' is expanded with the host prefix.
                if "avatar_scholar" in picture:
                    picture = _HOST.format(picture)
                author['url_picture'] = picture
        index = soup.find_all('td', class_='gsc_rsb_std')
        if index:
            # The first cell of the stats table is parsed as total citations.
            author['citedby'] = int(index[0].text)

    def _fill_indices(self, soup, author):
        index = soup.find_all('td', class_='gsc_rsb_std')
        if index:
            author['citedby'] = int(index[0].text)
            author['citedby5y'] = int(index[1].text)
            author['hindex'] = int(index[2].text)
            author['hindex5y'] = int(index[3].text)
            author['i10index'] = int(index[4].text)
            author['i10index5y'] = int(index[5].text)
        else:
            author['hindex'] = 0
            author['hindex5y'] = 0
            author['i10index'] = 0
            author['i10index5y'] = 0

    def _fill_counts(self, soup, author):
        years = [int(y.text)
                 for y in soup.find_all('span', class_='gsc_g_t')]
        cites = [int(c.text)
                 for c in soup.find_all('span', class_='gsc_g_al')]
        author['cites_per_year'] = dict(zip(years, cites))

    def _fill_publications(self, soup, author, publication_limit: int = 0, sortby_str: str = ''):
        """Collect the author's publications, following pagination.

        :param soup: first page of the profile.
        :param author: author container; ``author['publications']`` is
            reset and then extended in place.
        :param publication_limit: stop once this many publications were
            collected; a falsy value means no limit.
        :param sortby_str: extra query string appended to every page URL.
        """
        author['publications'] = []
        base_url = _CITATIONAUTH.format(author['scholar_id']) + sortby_str
        parser = PublicationParser(self.nav)

        offset = 0
        limit_reached = False
        while True:
            for row in soup.find_all('tr', class_='gsc_a_tr'):
                author['publications'].append(
                    parser.get_publication(
                        row, PublicationSource.AUTHOR_PUBLICATION_ENTRY))
                if publication_limit and len(author['publications']) >= publication_limit:
                    limit_reached = True
                    break

            # Paging ends when the "more" button is disabled or the limit is hit.
            more_button = soup.find('button', id='gsc_bpf_more')
            if 'disabled' in more_button.attrs or limit_reached:
                break

            offset += _PAGESIZE
            next_page = '{0}&cstart={1}&pagesize={2}'.format(
                base_url, offset, _PAGESIZE)
            soup = self.nav._get_soup(next_page)

    def _get_coauthors_short(self, soup):
        """Read the co-author list shown directly on the profile page.

        Helper for ``_fill_coauthors``.

        :returns: three parallel lists ``(ids, names, affiliations)``.
        """
        entries = soup.find_all('span', class_='gsc_rsb_a_desc')

        ids = []
        for entry in entries:
            href = entry('a')[0].get('href')
            ids.append(re.findall(_CITATIONAUTHRE, href)[0])

        names = []
        for entry in entries:
            names.append(entry.find(tabindex="-1").text)

        affiliations = []
        for entry in entries:
            affiliations.append(entry.find(class_="gsc_rsb_a_ext").text)

        return ids, names, affiliations

    def _get_coauthors_long(self, author):
        """Get the long (>20) list of coauthors.

        Loads the co-author dialog URL in a webdriver and scrapes it.
        Helper for ``_fill_coauthors``.

        :param author: author container; only ``author['scholar_id']`` is read.
        :returns: three lists ``(ids, names, affiliations)``.
        :raises WebDriverException: propagated from the webdriver; the
            caller falls back to the short list in that case.

        The webdriver is always quit, whether or not scraping succeeds.
        """
        wd = self.nav.pm._get_webdriver()
        try:
            wd.get(_COAUTH.format(author['scholar_id']))
            # Wait up to 30 seconds for the various elements to be available.
            # The wait may be better set elsewhere.
            wd.implicitly_wait(30)
            # ``find_elements(By..., ...)`` is used because the
            # ``find_elements_by_*`` helpers were removed in Selenium 4.3,
            # where they raise AttributeError and bypass the caller's
            # WebDriverException fallback.
            coauthors = wd.find_elements(By.CLASS_NAME, 'gs_ai_pho')
            coauthor_ids = [re.findall(_CITATIONAUTHRE,
                            coauth.get_attribute('href'))[0]
                            for coauth in coauthors]
            coauthor_names = [name.text for name in
                              wd.find_elements(By.CLASS_NAME, 'gs_ai_name')]
            coauthor_affils = [affil.text for affil in
                               wd.find_elements(By.CLASS_NAME, 'gs_ai_aff')]

            return coauthor_ids, coauthor_names, coauthor_affils
        finally:
            wd.quit()

    def _fill_coauthors(self, soup, author):
        """Fill ``author['coauthors']`` with one container per co-author.

        When the page has a "View All" button the full list is fetched via
        the webdriver. If that raises ``WebDriverException`` (e.g. the
        driver is not installed), the short on-page list is used instead
        and a warning is logged.
        """
        if soup.find_all('button', id='gsc_coauth_opn'):
            # "View All" is present: try the dialog box first.
            try:
                coauthor_info = self._get_coauthors_long(author)
            except WebDriverException as err:
                coauthor_info = self._get_coauthors_short(soup)
                self.nav.logger.warning(err.msg)
                self.nav.logger.warning("Fetching only the top 20 coauthors")
        else:
            # No "View All": everything is already on the profile page.
            coauthor_info = self._get_coauthors_short(soup)

        author['coauthors'] = []
        for scholar_id, name, affiliation in zip(*coauthor_info):
            coauthor = self.get_author(scholar_id)
            coauthor.update(
                name=name,
                affiliation=affiliation,
                source=AuthorSource.CO_AUTHORS_LIST,
            )
            author['coauthors'].append(coauthor)

[docs]    def fill(self, author, sections: list = [], sortby="citedby", publication_limit: int = 0):
        """Populate the Author with information from their profile

        The `sections` argument allows for finer granularity of the profile
        information to be pulled.

        :param sections: Sections of author profile to be filled, defaults to ``[]``.

            * ``basics``: fills name, affiliation, and interests;
            * ``citations``: fills h-index, i10-index, and 5-year analogues;
            * ``counts``: fills number of citations per year;
            * ``coauthors``: fills co-authors;
            * ``publications``: fills publications;
            * ``[]``: fills all of the above
        :type sections: ['basics','citations','counts','coauthors','publications',[]] list, optional
        :param sortby: Select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
        :type sortby: string
        :param publication_limit: Select the max number of publications you want you want to fill for the author. Defaults to no limit.
        :type publication_limit: int
        :returns: The filled object if fill was successfull, False otherwise.
        :rtype: Author or bool

        :Example::

        .. testcode::

            search_query = scholarly.search_author('Steven A Cholewiak')
            author = next(search_query)
            scholarly.pprint(author.fill(sections=['basic', 'citation_indices', 'co-authors']))

        :Output::

        .. testoutput::

            {'affiliation': 'Vision Scientist',
             'citedby': 304,
             'citedby5y': 226,
             'coauthors': [{'affiliation': 'Kurt Koffka Professor of Experimental '
                                           'Psychology, University of Giessen',
                            'filled': False,
                            'name': 'Roland Fleming',
                            'scholar_id': 'ruUKktgAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of Vision Science, UC Berkeley',
                            'filled': False,
                            'name': 'Martin Banks',
                            'scholar_id': 'Smr99uEAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Durham University, Computer Science & Physics',
                            'filled': False,
                            'name': 'Gordon D. Love',
                            'scholar_id': '3xJXtlwAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of ECE, Purdue University',
                            'filled': False,
                            'name': 'Hong Z Tan',
                            'scholar_id': 'OiVOAHMAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Deepmind',
                            'filled': False,
                            'name': 'Ari Weinstein',
                            'scholar_id': 'MnUboHYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': "Brigham and Women's Hospital/Harvard Medical "
                                           'School',
                            'filled': False,
                            'name': 'Chia-Chien Wu',
                            'scholar_id': 'dqokykoAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of Psychology and Cognitive Science, '
                                           'Rutgers University',
                            'filled': False,
                            'name': 'Jacob Feldman',
                            'scholar_id': 'KoJrMIAAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Research Scientist at Google Research, PhD '
                                           'Student at UC Berkeley',
                            'filled': False,
                            'name': 'Pratul Srinivasan',
                            'scholar_id': 'aYyDsZ0AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Formerly: Indiana University, Rutgers '
                                           'University, University of Pennsylvania',
                            'filled': False,
                            'name': 'Peter C. Pantelis',
                            'scholar_id': 'FoVvIK0AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor in Computer Science, University of '
                                           'California, Berkeley',
                            'filled': False,
                            'name': 'Ren Ng',
                            'scholar_id': '6H0mhLUAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Yale University',
                            'filled': False,
                            'name': 'Steven W Zucker',
                            'scholar_id': 'rNTIQXYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Brown University',
                            'filled': False,
                            'name': 'Ben Kunsberg',
                            'scholar_id': 'JPZWLKQAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Rutgers University, New Brunswick, NJ',
                            'filled': False,
                            'name': 'Manish Singh',
                            'scholar_id': '9XRvM88AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Silicon Valley Professor of ECE, Purdue '
                                           'University',
                            'filled': False,
                            'name': 'David S. Ebert',
                            'scholar_id': 'fD3JviYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Clinical Director, Neurolens Inc.,',
                            'filled': False,
                            'name': 'Vivek Labhishetty',
                            'scholar_id': 'tD7OGTQAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'MIT',
                            'filled': False,
                            'name': 'Joshua B. Tenenbaum',
                            'scholar_id': 'rRJ9wTJMUB8C',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Chief Scientist, isee AI',
                            'filled': False,
                            'name': 'Chris Baker',
                            'scholar_id': 'bTdT7hAAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of Psychology, Ewha Womans '
                                           'University',
                            'filled': False,
                            'name': 'Sung-Ho Kim',
                            'scholar_id': 'KXQb7CAAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Assistant Professor, Boston University',
                            'filled': False,
                            'name': 'Melissa M. Kibbe',
                            'scholar_id': 'NN4GKo8AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Nvidia Corporation',
                            'filled': False,
                            'name': 'Peter Shirley',
                            'scholar_id': 'nHx9IgYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'}],
             'email_domain': '@berkeley.edu',
             'filled': False,
             'hindex': 9,
             'hindex5y': 9,
             'i10index': 8,
             'i10index5y': 7,
             'interests': ['Depth Cues',
                           '3D Shape',
                           'Shape from Texture & Shading',
                           'Naive Physics',
                           'Haptics'],
             'name': 'Steven A. Cholewiak, PhD',
             'scholar_id': '4bahYMkAAAAJ',
             'source': 'SEARCH_AUTHOR_SNIPPETS',
             'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4bahYMkAAAAJ'}
        """
        try:
            sections = [section.lower() for section in sections]
            sortby_str = ''
            if sortby == "year":
                sortby_str = '&view_op=list_works&sortby=pubdate'
            elif sortby != "citedby":
                raise Exception("Please enter a valid sortby parameter. Options: 'year', 'citedby'")
            url_citations = _CITATIONAUTH.format(author['scholar_id'])
            url_citations += sortby_str
            url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE)
            soup = self.nav._get_soup(url)

            if sections == []:
                for i in self._sections:
                    if i not in author['filled']:
                        (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str))
                        author['filled'].append(i)
            else:
                for i in sections:
                    if i in self._sections and i not in author['filled']:
                        (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications' else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str))
                        author['filled'].append(i)
        except Exception as e:
            raise(e)

        return author


    def __repr__(self):
        return self.__str__()