Skip to content

Updates to retraction status checker #370

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
0c8ebca
initial_commits
geemi725 Sep 4, 2024
bd24406
first commit - retraction scripts
geemi725 Sep 5, 2024
9381296
Merge branch 'september-2024-release' of https://github.com/Future-Ho…
geemi725 Sep 5, 2024
26cdad5
removed: pandas dependency, retractions.csv
geemi725 Sep 7, 2024
f3ee3d1
Merge branch 'september-2024-release' of https://github.com/Future-Ho…
geemi725 Sep 9, 2024
e7b1441
not recording anymore
geemi725 Sep 9, 2024
d9edda0
relative import ..types -> paperqa.types
geemi725 Sep 10, 2024
71a1623
remove RetrationDataPostProcessor as a default client
geemi725 Sep 10, 2024
5fb3b71
test commit: remove RetrationDataPostProcessor from ALL_CLIENTS
geemi725 Sep 10, 2024
ccb984f
Added: formatted citation, tenacity retry
geemi725 Sep 10, 2024
3b38f25
removed: tqdm, added: citation deets to formatted_citation
geemi725 Sep 11, 2024
f6199d3
Merge branch 'september-2024-release' of https://github.com/Future-Ho…
geemi725 Sep 11, 2024
84b22d6
Check if citation is none, download method moved to crossref.py
geemi725 Sep 11, 2024
4612797
crossref mailto made modular
geemi725 Sep 11, 2024
4024778
moved all to one gitignore file
geemi725 Sep 11, 2024
e3613aa
Merge branch 'main' of https://github.com/Future-House/paper-qa into …
geemi725 Sep 11, 2024
a6925dd
fix: W0621
geemi725 Sep 11, 2024
06cfc99
Update paperqa/clients/crossref.py
geemi725 Sep 11, 2024
8eab028
crossref_mailto() -> get_crossref_mailto()
geemi725 Sep 11, 2024
2b6f7ca
Merge branch 'main' into issue-366
whitead Sep 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -303,3 +303,8 @@ cython_debug/
tests/*txt
tests/*html
tests/test_index/*
tests/example.*
tests/example2.*

# Client data
paperqa/clients/client_data/retractions.csv
62 changes: 56 additions & 6 deletions paperqa/clients/crossref.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from urllib.parse import quote

import aiohttp
from anyio import open_file
from tenacity import retry, stop_after_attempt, wait_exponential

from paperqa.types import CITATION_FALLBACK_DATA, DocDetails
from paperqa.utils import (
Expand Down Expand Up @@ -104,6 +106,20 @@ def crossref_headers() -> dict[str, str]:
return {}


def get_crossref_mailto() -> str:
    """
    Return the contact address to send along with Crossref requests.

    Reads the `CROSSREF_MAILTO` environment variable. When it is unset or
    empty, a warning is logged and a placeholder address is returned instead.
    """
    # Guard clause: a configured (non-empty) address wins outright.
    if configured_address := os.getenv("CROSSREF_MAILTO"):
        return configured_address

    logger.warning(
        "CROSSREF_MAILTO environment variable not set. Crossref API rate limits may"
        " apply."
    )
    return "[email protected]"


async def doi_to_bibtex(
doi: str,
session: aiohttp.ClientSession,
Expand Down Expand Up @@ -251,12 +267,7 @@ async def get_doc_details_from_crossref( # noqa: PLR0912

inputs_msg = f"DOI {doi}" if doi is not None else f"title {title}"

if not (CROSSREF_MAILTO := os.getenv("CROSSREF_MAILTO")):
logger.warning(
"CROSSREF_MAILTO environment variable not set. Crossref API rate limits may"
" apply."
)
CROSSREF_MAILTO = "[email protected]"
CROSSREF_MAILTO = get_crossref_mailto()
quoted_doi = f"/{quote(doi, safe='')}" if doi else ""
url = f"{CROSSREF_BASE_URL}/works{quoted_doi}"
params = {"mailto": CROSSREF_MAILTO}
Expand Down Expand Up @@ -335,6 +346,45 @@ async def get_doc_details_from_crossref( # noqa: PLR0912
return await parse_crossref_to_doc_details(message, session, query_bibtex)


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=5, min=5),
    reraise=True,
)
async def download_retracted_dataset(
    retraction_data_path: os.PathLike | str,
) -> None:
    """
    Download the retraction dataset from Crossref.

    Saves the retraction dataset to `retraction_data_path`. The payload is
    first streamed into a sibling ``<path>.part`` file and only moved over
    `retraction_data_path` once it is complete and non-empty, so a failed or
    empty download never leaves a truncated file where the cache is expected.

    The call is retried up to three times with exponential backoff; the last
    error is re-raised.

    Args:
        retraction_data_path: Destination file for the downloaded dataset.

    Raises:
        aiohttp.ClientResponseError: If Crossref answers with an error status.
        RuntimeError: If the downloaded payload is empty.
    """
    url = f"https://api.labs.crossref.org/data/retractionwatch?{get_crossref_mailto()}"
    target_path = str(retraction_data_path)
    partial_path = f"{target_path}.part"
    chunk_size = 64 * 1024  # larger reads mean far fewer awaits than 1 KiB chunks

    try:
        async with (
            aiohttp.ClientSession() as session,
            session.get(
                url,
                timeout=aiohttp.ClientTimeout(total=300),
            ) as response,
        ):
            response.raise_for_status()

            logger.info(
                f"Retraction data was not cached. Downloading retraction data from {url}..."
            )

            async with await open_file(partial_path, "wb") as f:
                while chunk := await response.content.read(chunk_size):
                    await f.write(chunk)

        if os.path.getsize(partial_path) == 0:
            raise RuntimeError("Retraction data is empty")

        # Atomic swap: readers see either the previous file or the full new one.
        os.replace(partial_path, target_path)
    finally:
        # Drop any leftover partial download (failure, timeout or empty payload).
        # After a successful replace the partial file is already gone.
        try:
            os.remove(partial_path)
        except FileNotFoundError:
            pass


class CrossrefProvider(DOIOrTitleBasedProvider):
async def _query(self, query: TitleAuthorQuery | DOIQuery) -> DocDetails | None:
if isinstance(query, DOIQuery):
Expand Down
40 changes: 2 additions & 38 deletions paperqa/clients/retractions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
import logging
import os

import aiohttp
from anyio import open_file
from pydantic import ValidationError
from tenacity import retry, stop_after_attempt, wait_exponential

from paperqa.types import DocDetails

from .client_models import DOIQuery, MetadataPostProcessor
from .crossref import download_retracted_dataset

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -52,40 +50,6 @@ def _has_cache_expired(self) -> bool:
def _is_csv_cached(self) -> bool:
    """Report whether a file already exists at `self.retraction_data_path`."""
    csv_location = self.retraction_data_path
    return os.path.exists(csv_location)

@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=5, min=5),
reraise=True,
)
async def _download_retracted_dataset(self) -> None:

if not (CROSSREF_MAILTO := os.getenv("CROSSREF_MAILTO")):
CROSSREF_MAILTO = "[email protected]"
url = f"https://api.labs.crossref.org/data/retractionwatch?{CROSSREF_MAILTO}"

async with (
aiohttp.ClientSession() as session,
session.get(
url,
timeout=aiohttp.ClientTimeout(total=300),
) as response,
):
response.raise_for_status()

logger.info(
f"Retraction data was not cashed. Downloading retraction data from {url}..."
)

async with await open_file(self.retraction_data_path, "wb") as f:
while True:
chunk = await response.content.read(1024)
if not chunk:
break
await f.write(chunk)

if os.path.getsize(self.retraction_data_path) == 0:
raise RuntimeError("Retraction data is empty")

def _filter_dois(self) -> None:
with open(self.retraction_data_path, newline="", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile)
Expand All @@ -96,7 +60,7 @@ def _filter_dois(self) -> None:

async def load_data(self) -> None:
    """
    Make sure the retraction dataset is on disk, then run `_filter_dois`.

    The dataset is downloaded to `self.retraction_data_path` only when no
    cached file exists or the cache has expired.
    """
    if not self._is_csv_cached() or self._has_cache_expired():
        # Single download through the shared Crossref helper; the former
        # private `_download_retracted_dataset` method is no longer called.
        await download_retracted_dataset(self.retraction_data_path)

    self._filter_dois()

Expand Down
11 changes: 10 additions & 1 deletion paperqa/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ class DocDetails(Doc):
" quality and None means it needs to be hydrated."
),
)

is_retracted: bool | None = Field(
default=None, description="Flag for whether the paper is retracted."
)
Expand Down Expand Up @@ -550,7 +551,14 @@ def __getitem__(self, item: str):
def formatted_citation(self) -> str:

if self.is_retracted:
return f"**RETRACTED ARTICLE** Citation: {self.citation} Retrieved from http://retractiondatabase.org/."
base_message = "**RETRACTED ARTICLE**"
retract_info = "Retrieved from http://retractiondatabase.org/."
citation_message = (
f"Citation: {self.citation}"
if self.citation
else f"Original DOI: {self.doi}"
)
return f"{base_message} {citation_message} {retract_info}"

if (
self.citation is None # type: ignore[redundant-expr]
Expand All @@ -561,6 +569,7 @@ def formatted_citation(self) -> str:
"Citation, citationCount, and sourceQuality are not set -- do you need"
" to call `hydrate`?"
)

quality = (
SOURCE_QUALITY_MESSAGES[self.source_quality]
if self.source_quality >= 0
Expand Down
3 changes: 0 additions & 3 deletions tests/.gitignore

This file was deleted.