Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
afc3fb4
Refactor README and Vicinity class to support any serializable item type
davidberenstein1957 Jan 20, 2025
9ffb491
Update README.md to include examples for saving/loading vector stores…
davidberenstein1957 Jan 20, 2025
7b2bb53
Refactor Vicinity class to streamline token handling
davidberenstein1957 Jan 20, 2025
a5ce987
Refactor item handling in tests and Vicinity class
davidberenstein1957 Jan 20, 2025
022c7b1
Apply suggestions from code review
davidberenstein1957 Jan 20, 2025
eaabbfa
Refactor token insertion in Vicinity class to simplify duplicate hand…
davidberenstein1957 Jan 20, 2025
031c136
Refactor token deletion logic in Vicinity class to improve error hand…
davidberenstein1957 Jan 20, 2025
26e7ed6
Enhance error handling in Vicinity class for JSON serialization
davidberenstein1957 Jan 20, 2025
6fb6305
Add non-serializable items fixture and test for Vicinity class
davidberenstein1957 Jan 20, 2025
c86f7e5
Add Hugging Face integration for Vicinity class
davidberenstein1957 Jan 28, 2025
a410686
Merge branch 'MinishLab:main' into add-hub-integration
davidberenstein1957 Jan 28, 2025
4f30d45
Enhance Hugging Face integration with improved error handling and dat…
davidberenstein1957 Feb 25, 2025
cab15e5
Update pyproject.toml and README.md for improved package installation…
davidberenstein1957 Feb 25, 2025
65465f3
Add test for Vicinity.load_from_hub method
davidberenstein1957 Feb 25, 2025
06545dd
Remove test files for utils and vicinity modules
davidberenstein1957 Feb 25, 2025
cc3fbf4
Add comprehensive test suites for Vicinity and utility functions
davidberenstein1957 Feb 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions vicinity/integrations/huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import json
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Any

from huggingface_hub import DatasetCard, upload_file, upload_folder

from vicinity.backends import BasicVectorStore, get_backend_class
from vicinity.datatypes import Backend

if TYPE_CHECKING:
from vicinity.vicinity import Vicinity


class HuggingFaceMixin:
    """Mixin that adds Hugging Face Hub save/load support to Vicinity."""

    def save_to_hub(
        self,
        repo_id: str,
        token: str | None = None,
        private: bool = False,
        **kwargs: Any,
    ) -> None:
        """
        Save the Vicinity instance to the Hugging Face Hub.

        Alias for :meth:`push_to_hub`.

        :param repo_id: The repository ID on the Hugging Face Hub.
        :param token: Optional authentication token for private repositories.
        :param private: Whether to create a private repository.
        :param kwargs: Additional arguments passed to push_to_hub().
        """
        self.push_to_hub(repo_id, token=token, private=private, **kwargs)

    def push_to_hub(
        self,
        repo_id: str,
        token: str | None = None,
        private: bool = False,
        **kwargs: Any,
    ) -> None:
        """
        Push the Vicinity instance to the Hugging Face Hub.

        Uploads three artifacts to a dataset repository: a ``datasets.Dataset``
        with the items (and vectors, when a vector store is present), the
        serialized backend under ``backend/``, and a ``config.json`` holding
        the metadata and backend type.

        :param repo_id: The repository ID on the Hugging Face Hub.
        :param token: Optional authentication token for private repositories.
        :param private: Whether to create a private repository.
        :param kwargs: Additional arguments passed to Dataset.push_to_hub().
        """
        # Imported lazily so the core package works without `datasets` installed.
        from datasets import Dataset

        # Create and push dataset with items and vectors.
        # Guard against an empty items list: indexing self.items[0] unguarded
        # would raise IndexError before any upload happens.
        if self.items and isinstance(self.items[0], dict):
            # Dict items become one dataset column per key.
            dataset_dict = {k: [item[k] for item in self.items] for k in self.items[0]}
        else:
            dataset_dict = {"items": self.items}
        if self.vector_store is not None:
            dataset_dict["vectors"] = self.vector_store.vectors
        dataset = Dataset.from_dict(dataset_dict)
        dataset.push_to_hub(repo_id, token=token, private=private, **kwargs)

        # Save backend and config files to a temp directory and upload them.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Save and upload backend
            self.backend.save(temp_path)
            upload_folder(
                repo_id=repo_id,
                folder_path=temp_path,
                token=token,
                repo_type="dataset",
                path_in_repo="backend",
            )

            # Save and upload config
            config = {"metadata": self.metadata, "backend_type": self.backend.backend_type.value}
            config_path = temp_path / "config.json"
            config_path.write_text(json.dumps(config))
            upload_file(
                repo_id=repo_id,
                path_or_fileobj=config_path,
                token=token,
                repo_type="dataset",
                path_in_repo="config.json",
            )

        # Push a dataset card describing the repo and how to load it.
        # `config` is still in scope after the `with` block.
        DatasetCard(
            content=(
                f"""
---
tags:
- vicinity
- vector-store
---

# Dataset Card for {repo_id}

This dataset was created using the [vicinity](https://github.com/MinishLab/vicinity) library, a lightweight nearest neighbors library with flexible backends.

It contains a vector space with {len(self.items)} items.

## Usage

You can load this dataset using the following code:

```python
from vicinity import Vicinity
vicinity = Vicinity.load_from_hub("{repo_id}")
```

After loading the dataset, you can use the `vicinity.query` method to find the nearest neighbors to a vector.

## Configuration

The configuration of the dataset is stored in the `config.json` file. The vector backend is stored in the `backend` folder.

```bash
{json.dumps(config, indent=2)}
```
"""
            )
        ).push_to_hub(repo_id, token=token, repo_type="dataset")

    @classmethod
    def load_from_hub(cls, repo_id: str, token: str | None = None, **kwargs: Any) -> "Vicinity":
        """
        Load a Vicinity instance from the Hugging Face Hub.

        :param repo_id: The repository ID on the Hugging Face Hub.
        :param token: Optional authentication token for private repositories.
        :param kwargs: Additional arguments passed to load_dataset.
        :return: A Vicinity instance loaded from the Hub.
        """
        from datasets import load_dataset
        from huggingface_hub import snapshot_download

        # Load dataset and extract items and vectors
        dataset = load_dataset(repo_id, token=token, split="train", **kwargs)
        if "items" in dataset.column_names:
            items = dataset["items"]
        else:
            # Rebuild dict items from all columns except 'vectors'.
            # Materialize each column exactly once: `dataset[col]` rebuilds the
            # whole column on every access, so indexing it inside the row loop
            # would be quadratic in the number of rows.
            columns = [col for col in dataset.column_names if col != "vectors"]
            column_data = {col: dataset[col] for col in columns}
            items = [{col: column_data[col][i] for col in columns} for i in range(len(dataset))]
        has_vectors = "vectors" in dataset.column_names
        vector_store = BasicVectorStore(vectors=dataset["vectors"]) if has_vectors else None

        # Download and load config and backend
        repo_path = Path(snapshot_download(repo_id=repo_id, token=token, repo_type="dataset"))
        with open(repo_path / "config.json") as f:
            config = json.load(f)

        backend_type = Backend(config["backend_type"])
        backend = get_backend_class(backend_type).load(repo_path / "backend")

        return cls(items=items, backend=backend, metadata=config["metadata"], vector_store=vector_store)
8 changes: 7 additions & 1 deletion vicinity/vicinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import importlib
import logging
from io import open
from pathlib import Path
Expand All @@ -19,8 +20,13 @@

logger = logging.getLogger(__name__)

if importlib.util.find_spec("huggingface_hub") is not None and importlib.util.find_spec("datasets") is not None:
Comment thread
davidberenstein1957 marked this conversation as resolved.
Outdated
from vicinity.integrations.huggingface import HuggingFaceMixin
else:
HuggingFaceMixin = object

class Vicinity:

class Vicinity(HuggingFaceMixin):
"""
Work with vector representations of items.

Expand Down