-
Notifications
You must be signed in to change notification settings - Fork 749
Add new unpaywall provider #310
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 1 commit
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
from __future__ import annotations | ||
|
||
import os | ||
from datetime import datetime | ||
from http import HTTPStatus | ||
from urllib.parse import quote | ||
|
||
import aiohttp | ||
from pydantic import BaseModel, ConfigDict, ValidationError | ||
|
||
from ..types import DocDetails | ||
from ..utils import ( | ||
_get_with_retrying, | ||
strings_similarity, | ||
) | ||
from .client_models import DOIOrTitleBasedProvider, DOIQuery, TitleAuthorQuery | ||
from .exceptions import DOINotFoundError | ||
|
||
# Root of the Unpaywall v2 REST API; request paths are appended directly to it.
UNPAYWALL_BASE_URL = "https://api.unpaywall.org/v2/"
# Per-request timeout; overridable through the UNPAYWALL_TIMEOUT env variable.
UNPAYWALL_TIMEOUT = float(os.getenv("UNPAYWALL_TIMEOUT", "10.0"))  # seconds
|
||
|
||
class Author(BaseModel):
    """One entry of the ``z_authors`` list of an Unpaywall record.

    Every declared field is optional and defaults to ``None``. The model is
    configured with ``extra="allow"``, so keys that are not declared here are
    accepted instead of failing validation.
    """

    model_config = ConfigDict(extra="allow")

    family: str | None = None
    given: str | None = None
    sequence: str | None = None
    affiliation: list[dict[str, str]] | None = None
|
||
|
||
class BestOaLocation(BaseModel):
    """Schema for the ``best_oa_location`` object of an Unpaywall record.

    Every declared field is optional and defaults to ``None``. The model is
    configured with ``extra="allow"``, so keys that are not declared here are
    accepted instead of failing validation.
    """

    model_config = ConfigDict(extra="allow")

    # Parsed into a datetime by pydantic (unlike ``oa_date``, kept as text).
    updated: datetime | None = None

    # Link fields.
    url: str | None = None
    url_for_pdf: str | None = None
    url_for_landing_page: str | None = None

    # Descriptive metadata, stored exactly as received.
    evidence: str | None = None
    license: str | None = None
    version: str | None = None
    host_type: str | None = None
    is_best: bool | None = None
    pmh_id: str | None = None
    endpoint_id: str | None = None
    repository_institution: str | None = None
    oa_date: str | None = None
|
||
|
||
class UnpaywallResponse(BaseModel):
    """Schema for a single Unpaywall record.

    ``doi`` and ``is_oa`` are the only required fields; a payload missing
    either one fails validation. All other fields are optional and default to
    ``None``.
    """

    # --- identification ---
    doi: str  # required
    doi_url: str | None = None
    title: str | None = None
    genre: str | None = None
    is_paratext: bool | None = None

    # --- publication ---
    # Kept as text here; parsing happens where the record is converted.
    published_date: str | None = None
    year: int | None = None
    journal_name: str | None = None
    journal_issns: str | None = None
    journal_issn_l: str | None = None
    journal_is_oa: bool | None = None
    journal_is_in_doaj: bool | None = None
    publisher: str | None = None

    # --- open-access status ---
    is_oa: bool  # required
    oa_status: str | None = None
    has_repository_copy: bool | None = None
    best_oa_location: BestOaLocation | None = None

    # --- record bookkeeping ---
    updated: datetime | None = None
    z_authors: list[Author] | None = None
|
||
|
||
class SearchResponse(BaseModel):
    """A single hit inside ``SearchResults.results``.

    All three fields are required.
    """

    # The matched record itself.
    response: UnpaywallResponse
    # Numeric score attached to this hit by the search endpoint.
    score: float
    # Text snippet returned alongside the hit.
    snippet: str
|
||
|
||
class SearchResults(BaseModel):
    """Top-level envelope of a title-search response.

    Both fields are required; ``results`` may be an empty list.
    """

    # Hits in the order they were received.
    results: list[SearchResponse]
    # Elapsed time as reported in the payload.
    elapsed_seconds: float
|
||
|
||
class UnpaywallProvider(DOIOrTitleBasedProvider):
    """Metadata provider backed by the Unpaywall v2 REST API.

    Documents are looked up either directly by DOI or through the title
    search endpoint, and the validated response is converted into a
    ``DocDetails``.
    """

    @staticmethod
    def _contact_email() -> str:
        """Return the value sent as the ``email`` query parameter.

        The ``UNPAYWALL_EMAIL`` environment variable is read on every call so
        changes made after import are honoured.
        """
        # NOTE(review): the fallback literal is reproduced verbatim from the
        # reviewed source, where it looks like an obfuscated placeholder
        # rather than a real address -- confirm the intended default.
        return os.environ.get("UNPAYWALL_EMAIL", "[email protected]")

    async def get_doc_details(
        self, doi: str, session: aiohttp.ClientSession
    ) -> DocDetails:
        """Fetch and convert the Unpaywall record for ``doi``.

        Args:
            doi: DOI appended to the Unpaywall base URL.
            session: aiohttp session used for the request.

        Returns:
            The record converted by ``_create_doc_details``.

        Raises:
            DOINotFoundError: If the payload does not validate against
                ``UnpaywallResponse``. The same exception type is registered
                with ``_get_with_retrying`` for HTTP 404 responses.
        """
        # The email is resolved outside the f-string: nesting the same quote
        # character inside an f-string is a SyntaxError before Python 3.12.
        email = self._contact_email()
        try:
            payload = await _get_with_retrying(
                url=f"{UNPAYWALL_BASE_URL}{doi}?email={email}",
                params={},
                session=session,
                timeout=UNPAYWALL_TIMEOUT,
                http_exception_mappings={
                    HTTPStatus.NOT_FOUND: DOINotFoundError(
                        f"Unpaywall could not find DOI for {doi}."
                    )
                },
            )
            results = UnpaywallResponse(**payload)
        except ValidationError as e:
            raise DOINotFoundError(
                f"Unpaywall results returned with a bad schema for DOI {doi!r}."
            ) from e

        return self._create_doc_details(results)

    @staticmethod
    def clean_query(query: str) -> str:
        """Return ``query`` unchanged.

        This is currently a pass-through applied to the title before it is
        URL-quoted in ``search_by_title``; no characters or words are removed.
        """
        return query

    async def search_by_title(
        self,
        query: str,
        session: aiohttp.ClientSession,
        title_similarity_threshold: float = 0.75,
    ) -> DocDetails:
        """Search Unpaywall by title and return the first hit if it matches.

        Only the first result is considered. It is accepted when
        ``strings_similarity`` between its title and ``query`` is at least
        ``title_similarity_threshold``.

        Args:
            query: Title to search for.
            session: aiohttp session used for the request.
            title_similarity_threshold: Minimum similarity required between
                the returned title and ``query``.

        Returns:
            Details of the first search hit.

        Raises:
            DOINotFoundError: If the payload does not validate against
                ``SearchResults``, the result list is empty, or the first
                hit's title is not similar enough. The same exception type is
                registered with ``_get_with_retrying`` for HTTP 404 responses.
        """
        # Both halves of the URL must be interpolated; previously the email
        # part was a plain string, so the literal "{os.environ.get(...)}" text
        # was sent to the API instead of an address.
        url = (
            f"{UNPAYWALL_BASE_URL}search?query={quote(self.clean_query(query))}"
            f"&email={self._contact_email()}"
        )
        try:
            payload = await _get_with_retrying(
                url=url,
                params={},
                session=session,
                timeout=UNPAYWALL_TIMEOUT,
                http_exception_mappings={
                    HTTPStatus.NOT_FOUND: DOINotFoundError(
                        f"Could not find DOI for {query}."
                    )
                },
            )
            results = SearchResults(**payload).results
        except ValidationError as e:
            raise DOINotFoundError(
                f"Unpaywall results returned with a bad schema for title {query!r}."
            ) from e

        if not results:
            raise DOINotFoundError(
                f"Unpaywall results did not match for title {query!r}."
            )

        details = self._create_doc_details(results[0].response)

        # A missing title is compared as "" so it is scored like any other.
        if strings_similarity(details.title or "", query) < title_similarity_threshold:
            raise DOINotFoundError(
                f"Unpaywall results did not match for title {query!r}."
            )
        return details

    def _create_doc_details(self, data: UnpaywallResponse) -> DocDetails:
        """Convert a validated Unpaywall record into a ``DocDetails``.

        Author names are built from the given and family names that are
        actually present; authors with neither are skipped, so the text
        "None" never leaks into a name.

        Raises:
            ValueError: If ``published_date`` is set but is not in
                ``YYYY-MM-DD`` form.
        """
        authors = [
            name
            for author in (data.z_authors or [])
            if (name := " ".join(p for p in (author.given, author.family) if p))
        ]
        best_location = data.best_oa_location

        return DocDetails(  # type: ignore[call-arg]
            authors=authors,
            publication_date=(
                None
                if not data.published_date
                else datetime.strptime(data.published_date, "%Y-%m-%d")
            ),
            year=data.year,
            journal=data.journal_name,
            publisher=data.publisher,
            url=None if not best_location else best_location.url,
            title=data.title,
            doi=data.doi,
            doi_url=data.doi_url,
            other={
                "genre": data.genre,
                "is_paratext": data.is_paratext,
                "journal_issns": data.journal_issns,
                "journal_issn_l": data.journal_issn_l,
                "journal_is_oa": data.journal_is_oa,
                "journal_is_in_doaj": data.journal_is_in_doaj,
                "is_oa": data.is_oa,
                "oa_status": data.oa_status,
                "has_repository_copy": data.has_repository_copy,
                "best_oa_location": (
                    None if not best_location else best_location.model_dump()
                ),
            },
        )

    async def _query(self, query: TitleAuthorQuery | DOIQuery) -> DocDetails | None:
        """Dispatch to the DOI lookup or the title search based on query type."""
        if isinstance(query, DOIQuery):
            return await self.get_doc_details(doi=query.doi, session=query.session)
        return await self.search_by_title(
            query=query.title,
            session=query.session,
            title_similarity_threshold=query.title_similarity_threshold,
        )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.