internal tagging #377

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 1 commit into base: main
55 changes: 44 additions & 11 deletions nomic/data_operations.py
@@ -1,6 +1,7 @@
import base64
import io
import json
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path
@@ -679,11 +680,14 @@ def __init__(self, projection: "AtlasProjection", auto_cleanup: Optional[bool] =
self.auto_cleanup = auto_cleanup

@property
def df(self, overwrite: bool = False) -> pd.DataFrame:
def df(self, overwrite: bool = False, wait_time: int = 120) -> pd.DataFrame:
Review comment (Contributor): Avoid using a @property with parameters. Consider converting df to a regular method so that parameters like overwrite and wait_time can be explicitly passed.

"""
Pandas DataFrame mapping each data point to its tags.

Args:
wait_time: The maximum time to wait while fetching a tag.
"""
tags = self.get_tags()
tags = self.get_tags(wait_time=wait_time)
tag_definition_ids = [tag["tag_definition_id"] for tag in tags]
if self.auto_cleanup:
self._remove_outdated_tag_files(tag_definition_ids)
@@ -708,11 +712,25 @@ def df(self, overwrite: bool = False) -> pd.DataFrame:
tbs.append(tb)
return pa.concat_tables(tbs).to_pandas()
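Following up on the review comment above about a @property with parameters, here is a minimal sketch of df as a regular method on AtlasMapTags so overwrite and wait_time can actually be passed by callers. It assumes the module's existing imports (pandas as pd, pyarrow as pa); the _load_tag_table helper is hypothetical and only stands in for the per-tag download logic elided in this diff.

```python
def df(self, overwrite: bool = False, wait_time: int = 120) -> pd.DataFrame:
    """Return a pandas DataFrame mapping each data point to its tags.

    Args:
        overwrite: If True, re-fetch tag data even if a local copy exists.
        wait_time: Maximum time in seconds to wait while fetching a tag.
    """
    tags = self.get_tags(wait_time=wait_time)
    tag_definition_ids = [tag["tag_definition_id"] for tag in tags]
    if self.auto_cleanup:
        self._remove_outdated_tag_files(tag_definition_ids)
    tbs = []
    for tag in tags:
        # _load_tag_table is a hypothetical helper standing in for the
        # per-tag Arrow table download/caching code shown elsewhere in this file.
        tbs.append(self._load_tag_table(tag, overwrite=overwrite))
    return pa.concat_tables(tbs).to_pandas()
```

Callers would then write projection.tags.df(wait_time=300); with the @property form in this PR, attribute access cannot forward arguments and always uses the defaults.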

def get_tags(self) -> List[Dict[str, str]]:
def is_tag_complete(self, tag_id) -> bool:
is_complete = requests.get(
Review comment (Contributor): Consider adding error handling (e.g. response.raise_for_status) in is_tag_complete to gracefully handle HTTP errors.

self.dataset.atlas_api_path + "/v1/project/projection/tags/status",
headers=self.dataset.header,
params={
"project_id": self.dataset.id,
"tag_id": tag_id,
},
).json()["is_complete"]
Review comment (Contributor): Consider adding error handling for the GET request in is_tag_complete. Currently, it directly calls .json() without checking status codes.

return is_complete
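A hedged sketch of the error handling suggested in the two comments above, reusing the same status endpoint, headers, and params shown in this diff; raise_for_status() surfaces 4xx/5xx responses as exceptions instead of letting .json() fail on an unexpected body.

```python
def is_tag_complete(self, tag_id) -> bool:
    """Return True if the server reports tagging as complete for tag_id."""
    response = requests.get(
        self.dataset.atlas_api_path + "/v1/project/projection/tags/status",
        headers=self.dataset.header,
        params={
            "project_id": self.dataset.id,
            "tag_id": tag_id,
        },
    )
    # Raise an HTTPError for 4xx/5xx responses rather than parsing an error page.
    response.raise_for_status()
    return response.json()["is_complete"]
```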

def get_tags(self, wait_time: int = 120) -> List[Dict[str, str]]:
"""
Retrieves all tags made in the web browser for a specific map.
Each tag is a dictionary containing tag_name, tag_id, and metadata.

Args:
wait_time: The maximum time to wait for a tag to be completed.

Returns:
A list of tags a user has created for the projection.
"""
@@ -723,16 +741,31 @@ def get_tags(self) -> List[Dict[str, str]]:
).json()
keep_tags = []
for tag in tags:
is_complete = requests.get(
self.dataset.atlas_api_path + "/v1/project/projection/tags/status",
headers=self.dataset.header,
params={
"project_id": self.dataset.id,
"tag_id": tag["tag_id"],
},
).json()["is_complete"]
tag_id = tag["tag_id"]
is_complete = self.is_tag_complete(tag_id)
if is_complete:
keep_tags.append(tag)
else:
# Use robotag route instead of v1/n so we guarantee only one request gets launched
requests.post(
Review comment (Contributor): Consider adding error handling for the robotag POST request to ensure it succeeds before proceeding.

self.dataset.atlas_api_path + "/v1/project/projection/tags/robotag",
headers=self.dataset.header,
json={"project_id": self.dataset.id, "tag_id": tag_id},
)
wait_start = time.time()
# Wait up to 5 minutes for tag to be completed
while not is_complete:
Review comment (Contributor): There is an inconsistency in the get_tags method in the AtlasMapTags class. The inline comment mentions sleeping for 5 seconds and waiting up to 5 minutes for a tag to be completed, but the code actually calls time.sleep(15) and uses a default wait_time of 120 seconds. Please update either the code or the comments so that they both reflect the intended behavior.

# Sleep 5 seconds
Review comment (Contributor): The comment mentions sleeping 5 seconds, but the code uses time.sleep(15). Please adjust the comment or the sleep duration for consistency.

Suggested change:
- # Sleep 5 seconds
+ # Sleep 15 seconds

time.sleep(15)
if time.time() >= wait_start + wait_time:
break
is_complete = self.is_tag_complete(tag_id)
if is_complete:
keep_tags.append(tag)
else:
logger.warning(
f"Tag {tag['tag_name']} currently unavailable for download from SDK. Download from {self.projection.dataset_link} instead or try again."
)
return keep_tags
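Taking the robotag and sleep/timeout comments above together, one way the incomplete-tag branch could be factored is sketched below. _wait_for_tag is a hypothetical helper on AtlasMapTags, not part of this PR; it checks the POST with raise_for_status and keeps the poll interval in an explicit parameter so the code and its comments cannot drift apart.

```python
def _wait_for_tag(self, tag_id: str, wait_time: int = 120, poll_interval: int = 15) -> bool:
    """Request robotagging for an incomplete tag, then poll until done or timed out."""
    # Use the robotag route so only one tagging request is launched for this tag.
    response = requests.post(
        self.dataset.atlas_api_path + "/v1/project/projection/tags/robotag",
        headers=self.dataset.header,
        json={"project_id": self.dataset.id, "tag_id": tag_id},
    )
    # Fail loudly if the request to start tagging was rejected.
    response.raise_for_status()

    deadline = time.time() + wait_time
    while time.time() < deadline:
        if self.is_tag_complete(tag_id):
            return True
        time.sleep(poll_interval)
    return False
```

get_tags would then shrink its else branch to something like: if is_complete or self._wait_for_tag(tag_id, wait_time=wait_time): keep_tags.append(tag), logging the existing warning otherwise.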

def get_datums_in_tag(self, tag_name: str, overwrite: bool = False):
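For context, a rough usage sketch of the new wait_time parameter from the SDK side. The dataset identifier is made up, and the access path (AtlasDataset -> maps -> tags) is assumed from the existing nomic client rather than taken from this diff.

```python
from nomic import AtlasDataset

dataset = AtlasDataset("my-org/my-dataset")   # hypothetical dataset identifier
projection = dataset.maps[0]                  # first map built on the dataset

# Wait up to 5 minutes for server-side tagging to finish before keeping a tag.
tags = projection.tags.get_tags(wait_time=300)
print([tag["tag_name"] for tag in tags])

# Note: with df still a @property (see the first review comment), attribute access
# cannot forward wait_time; it always uses the 120-second default.
tags_df = projection.tags.df
```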
2 changes: 1 addition & 1 deletion setup.py
@@ -23,7 +23,7 @@

setup(
name="nomic",
version="3.4.1",
version="3.4.2",
url="https://github.com/nomic-ai/nomic",
description=description,
long_description=long_description,