Source code for scholarly.author_parser

from .publication_parser import PublicationParser
import re
from .data_types import Author, AuthorSource, PublicationSource, PublicAccess
import codecs

_CITATIONAUTHRE = r'user=([\w-]*)'
_HOST = 'https://scholar.google.com{0}'
_PAGESIZE = 100
_EMAILAUTHORRE = r'Verified email at '
_CITATIONAUTH = '/citations?hl=en&user={0}'
_COAUTH = '/citations?view_op=list_colleagues&hl=en&user={0}'
_MANDATES = "/citations?hl=en&tzom=300&user={0}&view_op=list_mandates&pagesize={1}"
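# Illustrative only (not part of the module): the relative-URL templates above
# are formatted with a scholar id (and, for _MANDATES, a page size) and either
# wrapped in _HOST or handed to the Navigator. For a hypothetical id 'ABC123':
#
#   _CITATIONAUTH.format('ABC123')
#       -> '/citations?hl=en&user=ABC123'
#   _HOST.format(_CITATIONAUTH.format('ABC123'))
#       -> 'https://scholar.google.com/citations?hl=en&user=ABC123'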


class AuthorParser:
    """Returns an object for a single author"""

    def __init__(self, nav):
        self.nav = nav
        self._sections = ['basics', 'indices', 'counts', 'coauthors',
                          'publications', 'public_access']
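    # Each name in ``self._sections`` corresponds to a ``_fill_<section>``
    # method defined below; ``fill`` dispatches to them via
    # ``getattr(self, f'_fill_{section}')``.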
    def get_author(self, __data) -> Author:
        """ Fills the information for an author container """
        author: Author = {'container_type': 'Author'}
        author['filled'] = []
        if isinstance(__data, str):
            author['scholar_id'] = __data
            author['source'] = AuthorSource.AUTHOR_PROFILE_PAGE
        else:
            author['source'] = AuthorSource.SEARCH_AUTHOR_SNIPPETS
            author['scholar_id'] = re.findall(_CITATIONAUTHRE, __data('a')[0]['href'])[0]
            pic = '/citations?view_op=medium_photo&user={}'.format(author['scholar_id'])
            author['url_picture'] = _HOST.format(pic)

            name_class = self._find_tag_class_name(__data, 'h3', 'name')
            author['name'] = __data.find('h3', class_=name_class).text

            aff_class = self._find_tag_class_name(__data, 'div', 'aff')
            affiliation = __data.find('div', class_=aff_class)
            if affiliation:
                author['affiliation'] = affiliation.text

            email_class = self._find_tag_class_name(__data, 'div', 'eml')
            email = __data.find('div', class_=email_class)
            if email:
                author['email_domain'] = re.sub(_EMAILAUTHORRE, r'@', email.text)

            int_class = self._find_tag_class_name(__data, 'a', 'one_int')
            if int_class:
                interests = __data.find_all('a', class_=int_class)
                author['interests'] = [i.text.strip() for i in interests]
            else:
                author['interests'] = []

            citedby_class = self._find_tag_class_name(__data, 'div', 'cby')
            citedby = __data.find('div', class_=citedby_class)
            if citedby and citedby.text != '':
                author['citedby'] = int(citedby.text[9:])

        return author
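    # Illustrative sketch (hypothetical id and Navigator instance, not part of
    # the class): ``get_author`` accepts either a bare scholar_id string or a
    # BeautifulSoup tag taken from a search-results snippet.
    #
    #   parser = AuthorParser(nav)
    #   stub = parser.get_author('ABC123')
    #   stub['scholar_id']   # 'ABC123'
    #   stub['source']       # AuthorSource.AUTHOR_PROFILE_PAGE
    #   stub['filled']       # []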
    def _find_tag_class_name(self, __data, tag, text):
        elements = __data.find_all(tag)
        for element in elements:
            if 'class' in element.attrs and text in element.attrs['class'][0]:
                return element.attrs['class'][0]

    def _fill_basics(self, soup, author):
        author['name'] = soup.find('div', id='gsc_prf_in').text
        if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE:
            res = soup.find('img', id='gsc_prf_pup-img')
            if res is not None:
                if "avatar_scholar" not in res['src']:
                    author['url_picture'] = res['src']
        elif author['source'] == AuthorSource.CO_AUTHORS_LIST:
            picture = soup.find('img', id="gsc_prf_pup-img").get('src')
            if "avatar_scholar" in picture:
                picture = _HOST.format(picture)
            author['url_picture'] = picture

        affiliation = soup.find('div', class_='gsc_prf_il')
        author['affiliation'] = affiliation.text
        affiliation_link = affiliation.find('a')
        if affiliation_link:
            author['organization'] = int(affiliation_link.get('href').split("org=")[-1])
        author['interests'] = [i.text.strip() for i in
                               soup.find_all('a', class_='gsc_prf_inta')]

        email = soup.find('div', id="gsc_prf_ivh", class_="gsc_prf_il")
        if author['source'] == AuthorSource.AUTHOR_PROFILE_PAGE:
            if email.text != "No verified email":
                author['email_domain'] = '@'+email.text.split(" ")[3]

        homepage = email.find('a', class_="gsc_prf_ila")
        if homepage:
            author['homepage'] = homepage.get('href')

        index = soup.find_all('td', class_='gsc_rsb_std')
        if index:
            author['citedby'] = int(index[0].text)

    def _fill_indices(self, soup, author):
        index = soup.find_all('td', class_='gsc_rsb_std')
        if index:
            author['citedby'] = int(index[0].text)
            author['citedby5y'] = int(index[1].text)
            author['hindex'] = int(index[2].text)
            author['hindex5y'] = int(index[3].text)
            author['i10index'] = int(index[4].text)
            author['i10index5y'] = int(index[5].text)
        else:
            author['hindex'] = 0
            author['hindex5y'] = 0
            author['i10index'] = 0
            author['i10index5y'] = 0

    def _fill_counts(self, soup, author):
        years = [int(y.text) for y in soup.find_all('span', class_='gsc_g_t')]
        cites = [0]*len(years)
        for c in soup.find_all('a', class_='gsc_g_a'):
            # The last field of the bar's style attribute gives its offset
            # counting back from the most recent year, hence the negative index.
            i = int(c['style'].split(':')[-1])
            cites[-i] = int(c.find('span', class_='gsc_g_al').text)
        author['cites_per_year'] = dict(zip(years, cites))

    def _fill_public_access(self, soup, author):
        available = soup.find('div', class_='gsc_rsb_m_a')
        not_available = soup.find('div', class_='gsc_rsb_m_na')
        n_available, n_not_available = 0, 0
        if available:
            n_available = int(re.sub("[.,]", "", available.text.split(" ")[0]))
        if not_available:
            n_not_available = int(re.sub("[.,]", "", not_available.text.split(" ")[0]))
        author["public_access"] = PublicAccess(available=n_available,
                                               not_available=n_not_available)

        if 'publications' not in author['filled']:
            return

        # Map author_pub_id to the publication entries filled earlier
        publications = {pub['author_pub_id']: pub for pub in author['publications']}

        soup = self.nav._get_soup(_MANDATES.format(author['scholar_id'], _PAGESIZE))
        while True:
            rows = soup.find_all('div', 'gsc_mnd_sec_na')
            if rows:
                for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gs_nph gsc_mnd_link_font'):
                    author_pub_id = re.findall(r"citation_for_view=([\w:-]*)", row['data-href'])[0]
                    publications[author_pub_id]["public_access"] = False

            rows = soup.find_all('div', 'gsc_mnd_sec_avl')
            if rows:
                for row in rows[0].find_all('a', 'gsc_mnd_art_rvw gs_nph gsc_mnd_link_font'):
                    author_pub_id = re.findall(r"citation_for_view=([\w:-]*)", row['data-href'])[0]
                    publications[author_pub_id]["public_access"] = True

            next_button = soup.find(class_="gs_btnPR")
            if next_button and "disabled" not in next_button.attrs:
                # The navigation button's onclick holds an escaped URL; strip
                # the JavaScript wrapper and unescape it before fetching.
                url = next_button['onclick'][17:-1]
                url = codecs.getdecoder("unicode_escape")(url)[0]
                soup = self.nav._get_soup(url)
            else:
                break

    def _fill_publications(self, soup, author, publication_limit: int = 0, sortby_str: str = ''):
        author['publications'] = list()
        pubstart = 0
        url_citations = _CITATIONAUTH.format(author['scholar_id'])
        url_citations += sortby_str
        pub_parser = PublicationParser(self.nav)
        flag = False
        while True:
            for row in soup.find_all('tr', class_='gsc_a_tr'):
                new_pub = pub_parser.get_publication(row, PublicationSource.AUTHOR_PUBLICATION_ENTRY)
                author['publications'].append(new_pub)
                if (publication_limit) and (len(author['publications']) >= publication_limit):
                    flag = True
                    break
            if 'disabled' not in soup.find('button', id='gsc_bpf_more').attrs and not flag:
                pubstart += _PAGESIZE
                url = '{0}&cstart={1}&pagesize={2}'.format(
                    url_citations, pubstart, _PAGESIZE)
                soup = self.nav._get_soup(url)
            else:
                break

    def _get_coauthors_short(self, soup):
        """Get the short list of coauthors from the profile page.

        This method fetches the list of coauthors visible from an author's
        profile page alone. This may or may not be the complete list of
        coauthors.

        Note:
        -----
        This method is to be called by the _fill_coauthors method.
        """
        coauthors = soup.find_all('span', class_='gsc_rsb_a_desc')
        coauthor_ids = [re.findall(_CITATIONAUTHRE, coauth('a')[0].get('href'))[0]
                        for coauth in coauthors]
        coauthor_names = [coauth.find(tabindex="-1").text for coauth in coauthors]
        coauthor_affils = [coauth.find(class_="gsc_rsb_a_ext").text for coauth in coauthors]
        return coauthor_ids, coauthor_names, coauthor_affils

    def _get_coauthors_long(self, author):
        """Get the long (>20) list of coauthors.

        This method fetches the complete list of coauthors by opening a new
        page filled with the complete coauthor list.

        Note:
        -----
        This method is to be called by the _fill_coauthors method.
        """
        soup = self.nav._get_soup(_COAUTH.format(author['scholar_id']))
        coauthors = soup.find_all('div', 'gs_ai gs_scl')
        coauthor_ids = [re.findall(_CITATIONAUTHRE, coauth('a')[0].get('href'))[0]
                        for coauth in coauthors]
        coauthor_names = [coauth.find(class_="gs_ai_name").text for coauth in coauthors]
        coauthor_affils = [coauth.find(class_="gs_ai_aff").text for coauth in coauthors]
        return coauthor_ids, coauthor_names, coauthor_affils

    def _fill_coauthors(self, soup, author):
        # If "View All" is not found, scrape the page for coauthors
        if not soup.find_all('button', id='gsc_coauth_opn'):
            coauthor_info = self._get_coauthors_short(soup)
        else:
            # If "View All" is found, try opening the dialog box.
            # If geckodriver is not installed, resort to a short list and warn.
            try:
                coauthor_info = self._get_coauthors_long(author)
            except Exception as err:
                coauthor_info = self._get_coauthors_short(soup)
                self.nav.logger.warning(err)
                self.nav.logger.warning("Fetching only the top 20 coauthors")

        author['coauthors'] = []
        for coauth_id, coauth_name, coauth_affil in zip(*coauthor_info):
            new_coauthor = self.get_author(coauth_id)
            new_coauthor['name'] = coauth_name
            new_coauthor['affiliation'] = coauth_affil
            new_coauthor['source'] = AuthorSource.CO_AUTHORS_LIST
            author['coauthors'].append(new_coauthor)
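    # Illustrative sketch of what the helpers above write into the Author dict
    # (values hypothetical):
    #
    #   _fill_indices       -> author['citedby'], author['citedby5y'],
    #                          author['hindex'], author['i10index'], ...
    #   _fill_counts        -> author['cites_per_year'] == {2019: 12, 2020: 18, 2021: 25}
    #   _fill_public_access -> author['public_access'] with 'available' /
    #                          'not_available' counts, plus a per-publication
    #                          'public_access' flag when publications are filled
    #   _fill_coauthors     -> author['coauthors'] == list of unfilled Author stubs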
    def fill(self, author, sections: list = [], sortby="citedby", publication_limit: int = 0):
        """Populate the Author with information from their profile

        The `sections` argument allows for finer granularity of the profile
        information to be pulled.

        :param sections: Sections of author profile to be filled, defaults to ``[]``.

            * ``basics``: fills name, affiliation, and interests;
            * ``indices``: fills h-index, i10-index, and 5-year analogues;
            * ``counts``: fills number of citations per year;
            * ``public_access``: fills number of articles with public access mandates;
            * ``coauthors``: fills co-authors;
            * ``publications``: fills publications;
            * ``[]``: fills all of the above
        :type sections: ['basics','indices','counts','public_access','coauthors','publications',[]] list, optional
        :param sortby: Select the order of the citations in the author page. Either by 'citedby' or 'year'. Defaults to 'citedby'.
        :type sortby: string
        :param publication_limit: Select the max number of publications you want to fill for the author. Defaults to no limit.
        :type publication_limit: int
        :returns: The filled object if fill was successful, False otherwise.
        :rtype: Author or bool

        :Example::

        .. testcode::

            search_query = scholarly.search_author('Steven A Cholewiak')
            author = next(search_query)
            author = scholarly.fill(author, sections=['basics', 'indices', 'coauthors'])
            scholarly.pprint(author)

        :Output::

        .. testoutput::

            {'affiliation': 'Vision Scientist',
             'citedby': 304,
             'citedby5y': 226,
             'coauthors': [{'affiliation': 'Kurt Koffka Professor of Experimental '
                                           'Psychology, University of Giessen',
                            'filled': False,
                            'name': 'Roland Fleming',
                            'scholar_id': 'ruUKktgAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of Vision Science, UC Berkeley',
                            'filled': False,
                            'name': 'Martin Banks',
                            'scholar_id': 'Smr99uEAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Durham University, Computer Science & Physics',
                            'filled': False,
                            'name': 'Gordon D. Love',
                            'scholar_id': '3xJXtlwAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of ECE, Purdue University',
                            'filled': False,
                            'name': 'Hong Z Tan',
                            'scholar_id': 'OiVOAHMAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Deepmind',
                            'filled': False,
                            'name': 'Ari Weinstein',
                            'scholar_id': 'MnUboHYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': "Brigham and Women's Hospital/Harvard Medical "
                                           'School',
                            'filled': False,
                            'name': 'Chia-Chien Wu',
                            'scholar_id': 'dqokykoAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of Psychology and Cognitive Science, '
                                           'Rutgers University',
                            'filled': False,
                            'name': 'Jacob Feldman',
                            'scholar_id': 'KoJrMIAAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Research Scientist at Google Research, PhD '
                                           'Student at UC Berkeley',
                            'filled': False,
                            'name': 'Pratul Srinivasan',
                            'scholar_id': 'aYyDsZ0AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Formerly: Indiana University, Rutgers '
                                           'University, University of Pennsylvania',
                            'filled': False,
                            'name': 'Peter C. Pantelis',
                            'scholar_id': 'FoVvIK0AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor in Computer Science, University of '
                                           'California, Berkeley',
                            'filled': False,
                            'name': 'Ren Ng',
                            'scholar_id': '6H0mhLUAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Yale University',
                            'filled': False,
                            'name': 'Steven W Zucker',
                            'scholar_id': 'rNTIQXYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Brown University',
                            'filled': False,
                            'name': 'Ben Kunsberg',
                            'scholar_id': 'JPZWLKQAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Rutgers University, New Brunswick, NJ',
                            'filled': False,
                            'name': 'Manish Singh',
                            'scholar_id': '9XRvM88AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Silicon Valley Professor of ECE, Purdue '
                                           'University',
                            'filled': False,
                            'name': 'David S. Ebert',
                            'scholar_id': 'fD3JviYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Clinical Director, Neurolens Inc.,',
                            'filled': False,
                            'name': 'Vivek Labhishetty',
                            'scholar_id': 'tD7OGTQAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'MIT',
                            'filled': False,
                            'name': 'Joshua B. Tenenbaum',
                            'scholar_id': 'rRJ9wTJMUB8C',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Chief Scientist, isee AI',
                            'filled': False,
                            'name': 'Chris Baker',
                            'scholar_id': 'bTdT7hAAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Professor of Psychology, Ewha Womans '
                                           'University',
                            'filled': False,
                            'name': 'Sung-Ho Kim',
                            'scholar_id': 'KXQb7CAAAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Assistant Professor, Boston University',
                            'filled': False,
                            'name': 'Melissa M. Kibbe',
                            'scholar_id': 'NN4GKo8AAAAJ',
                            'source': 'CO_AUTHORS_LIST'},
                           {'affiliation': 'Nvidia Corporation',
                            'filled': False,
                            'name': 'Peter Shirley',
                            'scholar_id': 'nHx9IgYAAAAJ',
                            'source': 'CO_AUTHORS_LIST'}],
             'email_domain': '@berkeley.edu',
             'homepage': 'http://steven.cholewiak.com/',
             'filled': False,
             'hindex': 9,
             'hindex5y': 9,
             'i10index': 8,
             'i10index5y': 7,
             'interests': ['Depth Cues',
                           '3D Shape',
                           'Shape from Texture & Shading',
                           'Naive Physics',
                           'Haptics'],
             'name': 'Steven A. Cholewiak, PhD',
             'scholar_id': '4bahYMkAAAAJ',
             'source': 'SEARCH_AUTHOR_SNIPPETS',
             'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4bahYMkAAAAJ'}
        """
        try:
            sections = [section.lower() for section in sections]
            sections.sort(reverse=True)  # Ensure 'publications' comes before 'public_access'
            sortby_str = ''
            if sortby == "year":
                sortby_str = '&view_op=list_works&sortby=pubdate'
            elif sortby != "citedby":
                raise Exception("Please enter a valid sortby parameter. Options: 'year', 'citedby'")
            url_citations = _CITATIONAUTH.format(author['scholar_id'])
            url_citations += sortby_str
            url = '{0}&pagesize={1}'.format(url_citations, _PAGESIZE)
            soup = self.nav._get_soup(url)

            # Update scholar_id
            scholar_id = re.findall(_CITATIONAUTHRE, soup.find("link", rel="canonical").get('href', ""))[0]
            if scholar_id != author['scholar_id']:
                self.nav.logger.warning("Changing the scholar_id following redirect from %s to %s. "
                                        "To avoid this warning, use %s to look up this scholar.",
                                        author['scholar_id'], scholar_id, scholar_id)
                author["scholar_id"] = scholar_id

            if sections == []:
                for i in self._sections:
                    if i not in author['filled']:
                        (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications'
                         else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str))
                        author['filled'].append(i)
            else:
                for i in sections:
                    if i in self._sections and i not in author['filled']:
                        (getattr(self, f'_fill_{i}')(soup, author) if i != 'publications'
                         else getattr(self, f'_fill_{i}')(soup, author, publication_limit, sortby_str))
                        author['filled'].append(i)
        except Exception as e:
            raise(e)

        return author
    def __repr__(self):
        return self.__str__()
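# Usage sketch (illustrative only, mirroring the doctest in ``fill`` above):
# AuthorParser is normally driven through the public ``scholarly`` module
# rather than instantiated directly.
#
#   from scholarly import scholarly
#
#   search_query = scholarly.search_author('Steven A Cholewiak')
#   author = next(search_query)
#   author = scholarly.fill(author, sections=['basics', 'indices', 'coauthors'],
#                           sortby='citedby', publication_limit=10)
#   scholarly.pprint(author)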