"""Source code for scholarly.publication_parser: parsers that build Publication objects from Google Scholar pages."""

import re
import bibtexparser
import arrow
from bibtexparser.bibdatabase import BibDatabase
from .data_types import BibEntry, Mandate, Publication, PublicationSource

# Regex extracting the comma-separated cluster ids from a "cited by" link.
_SCHOLARPUBRE = r'cites=([\d,]*)'
# URL templates (relative to the Scholar host) and the regex that pulls the
# "<author id>:<publication id>" pair out of a citation link.
_CITATIONPUB = '/citations?hl=en&view_op=view_citation&citation_for_view={0}'
_SCHOLARPUB = '/scholar?hl=en&oi=bibs&cites={0}'
_CITATIONPUBRE = r'citation_for_view=([\w-]*:[\w-]*)'
# NOTE(review): the tail of this literal was truncated in the extracted text.
# It is reconstructed so that both positional fields used by
# `_BIBCITE.format(cid, pos)` exist -- confirm against upstream.
_BIBCITE = ('/scholar?hl=en&q=info:{0}:scholar.google.com/'
            '&output=cite&scirp={1}&hl=en')
_CITEDBYLINK = '/scholar?hl=en&cites={0}'
_MANDATES_URL = '/citations?view_op=view_mandate&hl=en&citation_for_view={0}'

# BibTeX field name -> key used inside a publication's 'bib' dictionary.
_BIB_MAPPING = {
    'ENTRYTYPE': 'pub_type',
    'ID': 'bib_id',
    'year': 'pub_year',
}

# Target data type per 'bib' key; remap_bib only acts on the value 'int'.
_BIB_DATATYPES = {
    'number': 'str',
    'volume': 'str',
}

# Inverse of the ENTRYTYPE/ID entries of _BIB_MAPPING, used when exporting
# a 'bib' dictionary back to BibTeX.
_BIB_REVERSE_MAPPING = {
    'pub_type': 'ENTRYTYPE',
    'bib_id': 'ID',
}

def remap_bib(parsed_bib: dict, mapping: dict, data_types: dict = None) -> BibEntry:
    """Rename keys of *parsed_bib* and coerce selected values, in place.

    :param parsed_bib: dictionary to rewrite; it is mutated and returned.
    :param mapping: ``{old_key: new_key}``; every ``old_key`` present in
        *parsed_bib* is moved to ``new_key``.
    :param data_types: optional ``{key: type_name}``; only the type name
        ``'int'`` triggers a conversion (``int(value)``), any other type
        name leaves the value untouched.
    :returns: the same *parsed_bib* object, after renaming and coercion.
    :raises ValueError: if a value marked ``'int'`` cannot be converted.
    """
    # A None default avoids sharing one mutable dict across calls.
    if data_types is None:
        data_types = {}

    for old_key, new_key in mapping.items():
        if old_key in parsed_bib:
            parsed_bib[new_key] = parsed_bib.pop(old_key)

    # Coercion runs after renaming, so data_types refers to the new keys.
    for key, type_name in data_types.items():
        if key in parsed_bib and type_name == 'int':
            parsed_bib[key] = int(parsed_bib[key])

    return parsed_bib
class _SearchScholarIterator(object):
    """Iterator that returns Publication objects from the search page.

    Rows of the current page are handed to a ``PublicationParser`` one by
    one; when the page is exhausted and it has a "next" navigation icon,
    the linked page is loaded and iteration continues there.

    I have removed all logging from here for simplicity. -V
    """

    def __init__(self, nav, url: str):
        """Load *url* through *nav* and prepare to iterate over its rows.

        :param nav: navigator object; must provide ``_get_soup(url)``.
        :param url: relative URL of the first result page.
        """
        self._url = url
        # URLs containing "/scholar?" are treated as regular search
        # results, anything else as a journal citation list.
        if "/scholar?" in url:
            self._pubtype = PublicationSource.PUBLICATION_SEARCH_SNIPPET
        else:
            self._pubtype = PublicationSource.JOURNAL_CITATION_LIST
        self._nav = nav
        self._load_url(url)
        self.total_results = self._get_total_results()
        self.pub_parser = PublicationParser(self._nav)

    def _load_url(self, url: str):
        """Fetch *url*, reset the row cursor and collect the result rows."""
        # this is temporary until setup json file
        self._soup = self._nav._get_soup(url)
        self._pos = 0
        search_rows = self._soup.find_all('div', class_='gs_r gs_or gs_scl')
        mandate_rows = self._soup.find_all('div', class_='gsc_mpat_ttl')
        self._rows = search_rows + mandate_rows

    def _get_total_results(self):
        """Return the result count shown on the current page.

        :returns: ``None`` when the page contains a ``gs_pda`` div, the
            parsed count when a ``gs_ab_mdw`` div starts with a number
            (optionally preceded by "About"), otherwise ``0``.
        """
        if self._soup.find("div", class_="gs_pda"):
            return None

        for x in self._soup.find_all('div', class_='gs_ab_mdw'):
            # Accounting for different thousands separators:
            # comma, dot, space, apostrophe
            match = re.match(pattern=r'(^|\s*About)\s*([0-9,\.\s’]+)', string=x.text)
            if match:
                # Group 2 is the number; drop the separators before int().
                return int(re.sub(pattern=r'[,\.\s’]', repl='',
                                  string=match.group(2)))
        return 0

    # Iterator protocol

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next parsed publication, following "next" links."""
        if self._pos < len(self._rows):
            row = self._rows[self._pos]
            self._pos += 1
            return self.pub_parser.get_publication(row, self._pubtype)

        next_icon = self._soup.find(class_='gs_ico gs_ico_nav_next')
        if next_icon:
            # The icon sits inside the anchor that links to the next page.
            url = next_icon.parent['href']
            self._url = url
            self._load_url(url)
            return self.__next__()

        raise StopIteration

    # Pickle protocol
    def __getstate__(self):
        """Only the current URL and row position are pickled."""
        return {'url': self._url, 'pos': self._pos}

    def __setstate__(self, state):
        """Reload the pickled URL and restore the row position.

        NOTE(review): ``_nav``, ``_pubtype`` and ``pub_parser`` are not part
        of the pickled state, yet ``_load_url`` reads ``self._nav`` -- the
        caller apparently has to set them before this works; confirm.
        """
        # this needs validation -V
        self._load_url(state['url'])
        self._pos = state['pos']
class PublicationParser(object):
    """Builds and completes Publication dictionaries from Scholar pages.

    ``get_publication`` creates a partially filled entry from one parsed
    HTML row, ``fill`` fetches the remaining details, and ``citedby`` /
    ``bibtex`` derive further data from a filled entry.
    """

    def __init__(self, nav):
        """Store the navigator used for all page requests.

        :param nav: navigator object; the methods below call its
            ``_get_soup`` and ``_get_page`` methods and read ``publib``.
        """
        self.nav = nav

    def _citation_pub(self, __data, publication: Publication):
        """Fill *publication* from a ``gsc_a_*`` row (author publication entry).

        :param __data: parsed HTML element of the row.
        :param publication: entry to populate; mutated and returned.
        """
        # create the bib entry in the dictionary
        title_link = __data.find('a', class_='gsc_a_at')
        publication['bib']['title'] = title_link.text
        publication['author_pub_id'] = re.findall(
            _CITATIONPUBRE, title_link['href'])[0]

        citedby = __data.find(class_='gsc_a_ac')
        publication["num_citations"] = 0
        # The citation cell can be present but blank when there are none.
        if citedby and not (citedby.text.isspace() or citedby.text == ''):
            publication["num_citations"] = int(citedby.text.strip())
            publication["citedby_url"] = citedby["href"]
            publication["cites_id"] = re.findall(
                _SCHOLARPUBRE, citedby["href"])[0].split(',')

        year = __data.find(class_='gsc_a_h')
        if year and year.text and not year.text.isspace():
            publication['bib']['pub_year'] = year.text.strip()

        # The second grey line of the row holds the citation string.
        author_citation = __data.find_all('div', class_='gs_gray')
        try:
            citation = author_citation[1].text
        except IndexError:
            citation = ""
        publication['bib']['citation'] = citation

        return publication

    def get_publication(self, __data, pubtype: PublicationSource) -> Publication:
        """Returns a publication that has either 'citation' or 'scholar' source

        :param __data: parsed HTML element describing one publication.
        :param pubtype: where the element comes from; selects the parser.
        :returns: a new, unfilled Publication dictionary. For sources
            without a dedicated parser only the skeleton is returned.
        """
        publication: Publication = {'container_type': 'Publication'}
        publication['source'] = pubtype
        publication['bib'] = {}
        publication['filled'] = False

        if pubtype == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
            return self._citation_pub(__data, publication)
        if pubtype == PublicationSource.PUBLICATION_SEARCH_SNIPPET:
            return self._scholar_pub(__data, publication)
        # TODO: self._journal_pub(__data, publication) for
        # PublicationSource.JOURNAL_CITATION_LIST
        return publication

    def _get_authorlist(self, authorinfo):
        """Extract author names from the text of an author/venue line.

        Only the part before the first ``' - '`` is considered. Comma
        separated items that contain a digit, a bracket/parenthesis or a
        venue-like keyword are skipped; ellipsis characters are removed.

        :param authorinfo: plain text of the author/venue/year/host line.
        :returns: list of author name strings.
        """
        venue_markers = ("Proceedings", "Conference", "Journal",
                         "(", ")", "[", "]", "Transactions")
        authorlist = []
        text = authorinfo.split(' - ')[0]
        for i in text.split(','):
            i = i.strip()
            # Items containing digits are years or page numbers, not names.
            if re.search(r'\d', i):
                continue
            if any(marker in i for marker in venue_markers):
                continue
            authorlist.append(i.replace("…", ""))
        return authorlist

    def _get_author_id_list(self, authorinfo_inner_html):
        """Extract the ``user=`` id of every author in the author line.

        :param authorinfo_inner_html: inner HTML of the author line.
        :returns: one entry per comma-separated author; the empty string
            for authors whose HTML carries no ``?user=...&amp;`` link.
        """
        author_id_list = []
        html = authorinfo_inner_html.split(' - ')[0]
        for author_html in html.split(','):
            author_html = author_html.strip()
            match = re.search('\\?user=(.*?)&amp;', author_html)
            author_id_list.append(match.groups()[0] if match else "")
        return author_id_list

    def _scholar_pub(self, __data, publication: Publication):
        """Fill *publication* from a search-result row (``gs_ri`` box).

        :param __data: parsed HTML element of the search result.
        :param publication: entry to populate; mutated and returned.
        """
        databox = __data.find('div', class_='gs_ri')
        title = databox.find('h3', class_='gs_rt')

        cid = __data.get('data-cid')
        pos = __data.get('data-rp')

        publication['gsrank'] = int(pos) + 1

        # Remove the leading marker span so it does not end up in the title.
        if title.find('span', class_='gs_ctu'):  # A citation
            title.span.extract()
        elif title.find('span', class_='gs_ctc'):  # A book or PDF
            title.span.extract()

        publication['bib']['title'] = title.text.strip()

        title_link = title.find('a')
        if title_link:
            publication['pub_url'] = title_link['href']

        author_div_element = databox.find('div', class_='gs_a')
        authorinfo = author_div_element.text
        authorinfo = authorinfo.replace(u'\xa0', u' ')   # NBSP
        authorinfo = authorinfo.replace(u'&amp;', u'&')  # Ampersand
        publication['bib']["author"] = self._get_authorlist(authorinfo)
        authorinfo_html = author_div_element.decode_contents()
        publication["author_id"] = self._get_author_id_list(authorinfo_html)

        # There are 4 (known) patterns in the author/venue/year/host line:
        #  (A) authors - host
        #  (B) authors - venue, year - host
        #  (C) authors - venue - host
        #  (D) authors - year - host
        # The authors are handled above so below is only concerned with
        # the middle venue/year part. In principle the venue is separated
        # from the year by a comma. However, there exist venues with commas
        # and as shown above there might not always be a venue AND a year...
        venueyear = authorinfo.split(' - ')
        # If there is no middle part (A) then venue and year are unknown.
        if len(venueyear) <= 2:
            publication['bib']['venue'], publication['bib']['pub_year'] = 'NA', 'NA'
        else:
            venueyear = venueyear[1].split(',')
            venue = 'NA'
            year = venueyear[-1].strip()
            if year.isnumeric() and len(year) == 4:
                publication['bib']['pub_year'] = year
                if len(venueyear) >= 2:
                    venue = ','.join(venueyear[0:-1])  # everything but last
            else:
                venue = ','.join(venueyear)  # everything
                publication['bib']['pub_year'] = 'NA'
            publication['bib']['venue'] = venue

        snippet = databox.find('div', class_='gs_rs')
        if snippet:
            abstract = snippet.text
            abstract = abstract.replace(u'\u2026', u'')
            abstract = abstract.replace(u'\n', u' ')
            abstract = abstract.strip()
            # Drop a leading "Abstract" label plus the character after it.
            if abstract[0:8].lower() == 'abstract':
                abstract = abstract[9:].strip()
            publication['bib']['abstract'] = abstract

        publication['url_scholarbib'] = _BIBCITE.format(cid, pos)
        sclib = self.nav.publib.format(id=cid)
        publication['url_add_sclib'] = sclib

        lowerlinks = databox.find('div', class_='gs_fl').find_all('a')

        publication["num_citations"] = 0

        for link in lowerlinks:
            if 'Cited by' in link.text:
                publication['num_citations'] = int(
                    re.findall(r'\d+', link.text)[0].strip())
                publication['citedby_url'] = link['href']

            if 'Related articles' in link.text:
                publication['url_related_articles'] = link['href']

        eprint_box = __data.find('div', class_='gs_ggs gs_fl')
        if eprint_box:
            publication['eprint_url'] = eprint_box.a['href']
        return publication

    def fill(self, publication: Publication) -> Publication:
        """Populate the Publication with information from its profile

        :param publication: Scholar or Citation publication container object
            that is not filled
        :type publication: PublicationCitation or PublicationScholar
        :returns: the same object; ``publication['filled']`` is set to True
            for the two supported sources, other sources are returned
            unchanged.
        """
        if publication['source'] == PublicationSource.AUTHOR_PUBLICATION_ENTRY:
            url = _CITATIONPUB.format(publication['author_pub_id'])
            soup = self.nav._get_soup(url)
            bib = publication['bib']

            bib['title'] = soup.find('div', id='gsc_oci_title').text
            # A title ending in an ellipsis is truncated; prefer the title
            # from the merged snippet when one exists. endswith() also
            # copes with an empty title, where [-1] would raise.
            if bib['title'].endswith('\u2026'):
                merged_snippet = soup.find('div', class_='gsc_oci_merged_snippet')
                if merged_snippet:
                    title_div = merged_snippet.find('div')
                    if title_div:
                        bib['title'] = title_div.text

            title_link = soup.find('a', class_='gsc_oci_title_link')
            if title_link:
                publication['pub_url'] = title_link['href']

            # Page field label -> 'bib' key for values copied verbatim.
            plain_fields = {
                'journal': 'journal',
                'conference': 'conference',
                'volume': 'volume',
                'issue': 'number',
                'pages': 'pages',
                'publisher': 'publisher',
            }

            for item in soup.find_all('div', class_='gs_scl'):
                key = item.find(class_='gsc_oci_field').text.strip().lower()
                val = item.find(class_='gsc_oci_value')
                if key == 'authors' or key == 'inventors':
                    bib['author'] = ' and '.join(
                        [i.strip() for i in val.text.split(',')])
                elif key in plain_fields:
                    bib[plain_fields[key]] = val.text
                elif key == 'publication date':
                    patterns = ['YYYY/M',
                                'YYYY/MM/DD',
                                'YYYY',
                                'YYYY/M/DD',
                                'YYYY/M/D',
                                'YYYY/MM/D']
                    bib['pub_year'] = arrow.get(val.text, patterns).year
                elif key == 'description':
                    # try to find all the gsh_csp if they exist
                    abstract = val.find_all(class_='gsh_csp')
                    result = ""

                    # append all gsh_csp together as there can be multiple
                    # in certain scenarios
                    for part in abstract:
                        if part.text[0:8].lower() == 'abstract':
                            result += part.text[9:].strip()
                        else:
                            result += part.text

                    if len(abstract) == 0:  # if no gsh_csp were found
                        abstract = val.find(class_='gsh_small')
                        if abstract:
                            if abstract.text[0:8].lower() == 'abstract':
                                result = abstract.text[9:].strip()
                            else:
                                result = abstract.text
                        else:
                            result = ' '.join(
                                [description_part for description_part in val])

                    bib['abstract'] = result
                elif key == 'total citations':
                    publication['cites_id'] = re.findall(
                        _SCHOLARPUBRE, val.a['href'])[0].split(',')
                    publication['citedby_url'] = _CITEDBYLINK.format(
                        ','.join(publication['cites_id']))
                elif key == 'scholar articles':
                    for entry in val.find_all('a'):
                        if entry.text.lower() == 'related articles':
                            # Drops the first 26 characters of the href;
                            # len('https://scholar.google.com') == 26, so this
                            # presumably strips the host -- confirm.
                            publication['url_related_articles'] = entry.get('href')[26:]
                            break

            # number of citation per year
            years = [int(y.text) for y in soup.find_all(class_='gsc_oci_g_t')]
            cites = [int(c.text) for c in soup.find_all(class_='gsc_oci_g_al')]
            cites_year = [int(c.get('href')[-4:])
                          for c in soup.find_all(class_='gsc_oci_g_a')]
            nonzero_cites_per_year = dict(zip(cites_year, cites))
            # Years listed on the axis without a bar get an explicit zero.
            publication['cites_per_year'] = {
                year: nonzero_cites_per_year.get(year, 0) for year in years}

            eprint_box = soup.find('div', class_='gsc_vcd_title_ggi')
            if eprint_box:
                publication['eprint_url'] = eprint_box.a['href']

            if publication.get('public_access', None):
                publication['mandates'] = []
                self._fill_public_access_mandates(publication)

            publication['filled'] = True
        elif publication['source'] == PublicationSource.PUBLICATION_SEARCH_SNIPPET:
            bibtex_url = self._get_bibtex(publication['url_scholarbib'])
            bibtex = self.nav._get_page(bibtex_url)
            parser = bibtexparser.bparser.BibTexParser(common_strings=True)
            parsed_bib = remap_bib(
                bibtexparser.loads(bibtex, parser).entries[-1],
                _BIB_MAPPING, _BIB_DATATYPES)
            publication['bib'].update(parsed_bib)
            publication['filled'] = True
        return publication

    def citedby(self, publication: Publication) -> _SearchScholarIterator:
        """Searches Google Scholar for other articles that cite this Publication and
        returns a Publication generator.

        :param publication: Scholar or Citation publication container object
        :type publication: Publication

        :getter: Returns a Generator of Publications that cited the current.
        :type: Iterator[:class:`Publication`]
        """
        if not publication['filled']:
            publication = self.fill(publication)
        return _SearchScholarIterator(self.nav, publication['citedby_url'])

    def bibtex(self, publication: Publication) -> str:
        """Returns the publication as a Bibtex entry

        :param publication: Scholar or Citation publication container object
        :type publication: Publication

        :getter: Returns a Bibtex entry in text format
        :type: str
        """
        if not publication['filled']:
            publication = self.fill(publication)
        database = BibDatabase()
        # remap_bib renames keys in place, so work on a shallow copy to
        # leave publication['bib'] with its original key names.
        converted_dict = remap_bib(dict(publication['bib']), _BIB_REVERSE_MAPPING)
        # convert every value of the dictionary to string to be Bibtex compatible
        str_dict = {key: str(value) for key, value in converted_dict.items()}
        database.entries = [str_dict]
        return bibtexparser.dumps(database)

    def _get_bibtex(self, bib_url) -> str:
        """Retrieves the bibtex url

        :param bib_url: URL of the citation-formats page.
        :returns: href of the link labelled "bibtex" (case-insensitive),
            or the empty string if there is none.
        """
        soup = self.nav._get_soup(bib_url)
        styles = soup.find_all('a', class_='gs_citi')
        for link in styles:
            # link.string is None when the anchor has no single text child.
            if link.string and link.string.lower() == "bibtex":
                return link.get('href')
        return ''

    def _fill_public_access_mandates(self, publication: Publication) -> None:
        """Fills the public access mandates

        Does nothing unless ``publication['public_access']`` is truthy.
        Each ``<li>`` of the mandates page becomes one ``Mandate`` that is
        appended to ``publication['mandates']``.
        """
        if not publication.get('public_access', None):
            return

        soup = self.nav._get_soup(
            _MANDATES_URL.format(publication['author_pub_id']))
        # Create the list if the caller has not done so already.
        mandates = publication.setdefault('mandates', [])
        for mandate in soup.find_all('li'):
            m = Mandate()
            m['agency'] = mandate.find('span', class_='gsc_md_mndt_name').text
            m['url_policy'] = mandate.find('div', class_='gsc_md_mndt_title').a['href']
            m['url_policy_cached'] = mandate.find('span', class_='gs_a').a['href']
            for desc in mandate.find_all('div', class_='gsc_md_mndt_desc'):
                match = re.search("Effective date: [0-9]{4}/[0-9]{1,2}", desc.text)
                if match:
                    m['effective_date'] = re.sub(pattern="Effective date: ", repl="",
                                                 string=match.group(0))

                match = re.search("Embargo: ", desc.text)
                if match:
                    m['embargo'] = re.sub(pattern="Embargo: ", repl="",
                                          string=desc.text[match.end():])

                if "Grant: " in desc.text:
                    m['grant'] = desc.text.split("Grant: ")[1]

                if "Funding acknowledgment" in desc.text:
                    m['acknowledgement'] = desc.find('span', class_='gs_gray').text

            mandates.append(m)