# agents_wiki SDK Documentation
The AgentsWiki SDK (`wiki_sdk/agents_wiki/`) provides a unified Python interface to:
- ArangoDB — Graph/document database (CRUD, queries, ingestion).
- Embeddings DBs — Weaviate & Pinecone (vector CRUD, search, index management).
- Arango → Embeddings Ingestor — Pipelines that embed Arango documents into vector databases using pluggable models.
## Installation
Clone the repo and install in editable mode:
git clone https://github.com/your-org/agents_wiki.git
cd agents_wiki/wiki_sdk
pip install -e .
Optional extras
# For S3 ingestion
pip install agents_wiki[s3]
# For embedding models (Hugging Face / Sentence-Transformers, PyTorch)
pip install agents_wiki[ml]
# For development tools
pip install agents_wiki[dev]
## Importing
After installation, you can import the SDK modules like this:
# ArangoDB SDK
from agents_wiki.db import ArangoSDK
# Embeddings SDK (Weaviate & Pinecone)
from agents_wiki.embeddings import EmbeddingsSDK
# Arango → Embeddings ingestion pipeline
from agents_wiki.embeddings_store import ArangoToEmbeddingsIngestor, EmbeddingModel
## ArangoDB Usage & Ingestion
The ArangoSDK (`agents_wiki.db`) is a wrapper around the official `python-arango` client, exposing a simpler API for:
- Database & collection lifecycle
- CRUD operations
- AQL queries
- Ingestion from JSON/JSONL (local & S3)
### Connecting to ArangoDB
from agents_wiki.db import ArangoSDK
# Connect to Arango
arango = ArangoSDK(
endpoint="http://arangodb.databases.svc.cluster.local:8529",
db_name="_system",
username="root",
password="changeme"
)
### Database Lifecycle
# List all databases
print(arango.list_databases())
# Create a new database with replication
arango.create_database(
"knowledge",
options={"replicationFactor": 2, "writeConcern": 2}
)
# Switch active DB
arango.switch_db("knowledge", username="root", password="changeme")
# Delete a database
arango.delete_database("knowledge")
### Collection Lifecycle
# Check if collection exists
print(arango.has_collection("agents"))
# Create collection with sharding & replication
arango.create_collection(
"agents",
replication_factor=2,
number_of_shards=6,
write_concern=2,
shard_keys=["_key"]
)
# Delete collection
arango.delete_collection("agents")
### CRUD Operations
# Insert a document
arango.insert_one("agents", {"_key": "a1", "name": "Alice", "role": "analyst"})
# Get document by key
print(arango.get_one("agents", "a1"))
# Update document
arango.update_one("agents", {"_key": "a1", "role": "senior analyst"})
# Replace document
arango.replace_one("agents", {"_key": "a1", "name": "Alice", "role": "lead"})
# Delete document
arango.delete_one("agents", "a1")
### AQL Queries
# Simple AQL query
results = arango.aql("FOR doc IN agents FILTER doc.role == @role RETURN doc", bind_vars={"role": "lead"})
for r in results:
print(r)
### Ingestion from Local JSON / JSONL
Supports array JSON (`[ {...}, {...} ]`) and JSON Lines (`{...}\n{...}`).
# Insert data from file
stats = arango.ingest_from_file("agents", "data/agents.jsonl", upsert_on="_key")
print(stats) # {"processed": 5000, "method": "upsert"}
### Ingestion from S3
# Ingest directly from S3
stats = arango.ingest_from_s3(
collection="agents",
bucket="my-bucket",
key="exports/agents.jsonl",
upsert_on="_key",
batch_size=1000
)
print(stats)
### Bulk Import (High Throughput)
For very large datasets, use Arango's HTTP bulk importer:
stats = arango.http_bulk_import(
collection="agents",
jsonl_path="data/agents.jsonl",
on_duplicate="update" # options: error | update | replace | ignore
)
print(stats) # {"created": 95000, "updated": 5000, "errors": 0}
### Management Commands
print(arango.server_status()) # Arango server info
print(arango.server_role()) # SINGLE, COORDINATOR, or DBSERVER
print(arango.cluster_health()) # Cluster health (if cluster mode)
arango.routing_reload() # Reload routing information
## Embeddings Database Usage
The EmbeddingsSDK (`agents_wiki.embeddings`) provides a unified interface to work with vector databases:
- Weaviate (self-hosted, Kubernetes-native)
- Pinecone (managed SaaS, serverless indexes)
It exposes APIs for:
- Index/collection lifecycle
- Vector CRUD (insert, fetch, update, delete)
- Vector search with filters
- Data ingestion from JSON/JSONL (local or S3)
### Connecting
Weaviate
from agents_wiki.embeddings import EmbeddingsSDK
weav = EmbeddingsSDK(
provider="weaviate",
weaviate_url="http://weaviate.vector-db.svc.cluster.local:8080"
)
Pinecone
pine = EmbeddingsSDK(
provider="pinecone",
pinecone_api_key="YOUR_PINECONE_API_KEY",
pinecone_index_name="agents-index",
dimension=1536, # must match your embedding model
metric="cosine",
pinecone_cloud="aws",
pinecone_region="us-east-1"
)
### Collection / Index Lifecycle
# Create collection (Weaviate)
weav.create_collection("AgentVectors")
# Create index (Pinecone)
pine.create_collection("agents-index")
# List all collections
print(weav.list_collections())
print(pine.list_collections())
# Delete collection/index
weav.delete_collection("AgentVectors")
pine.delete_collection("agents-index")
### CRUD Operations
# Insert vectors
vectors = [
{"id": "a1", "values": [0.1, 0.2, 0.3], "metadata": {"name": "Alice"}},
{"id": "a2", "values": [0.4, 0.5, 0.6], "metadata": {"name": "Bob"}}
]
weav.insert_vectors("AgentVectors", vectors)
pine.insert_vectors("agents-index", vectors)
# Fetch by ID
print(weav.get_vector("AgentVectors", "a1"))
print(pine.get_vector("agents-index", "a1"))
# Update vector
weav.update_vector("AgentVectors", "a1", [0.9, 0.8, 0.7], {"name": "Alice Updated"})
pine.update_vector("agents-index", "a1", [0.9, 0.8, 0.7], {"name": "Alice Updated"})
# Delete vector
weav.delete_vector("AgentVectors", "a2")
pine.delete_vector("agents-index", "a2")
### Vector Search
# Search Weaviate (top-3 nearest neighbors)
res = weav.search("AgentVectors", [0.1, 0.2, 0.3], top_k=3)
print(res)
# Search Pinecone with filter
res = pine.search(
"agents-index",
query_vector=[0.5] * 1536,
top_k=5,
filters={"name": {"$eq": "Alice"}}
)
print(res)
### Ingestion from JSON / JSONL
Supports array JSON (`[ {...}, {...} ]`) and JSON Lines (`{...}\n{...}`).
# Bulk ingest from file
weav.ingest_from_file("AgentVectors", "data/vectors.jsonl")
pine.ingest_from_file("agents-index", "data/vectors.jsonl")
### Ingestion from S3
weav.ingest_from_s3("AgentVectors", bucket="my-bucket", key="exports/vectors.jsonl")
pine.ingest_from_s3("agents-index", bucket="my-bucket", key="exports/vectors.jsonl")
## Arango → Embeddings Ingestion
The ArangoToEmbeddingsIngestor (`agents_wiki.embeddings_store`) provides a streaming pipeline that:
- Pulls documents from ArangoDB (via AQL queries).
- Builds text (for embedding) + metadata (to store in the vector DB).
- Runs texts through a pluggable embedding model (`EmbeddingModel`).
- Upserts the resulting vectors into the Embeddings DB (Weaviate or Pinecone).
This enables dual storage:
- ArangoDB keeps structured knowledge.
- Embeddings DB enables semantic search.
### Key Concepts
- EmbeddingModel (abstract): You must implement `embed_texts(texts: List[str]) -> List[List[float]]`. This allows you to use any embedding backend (OpenAI, Hugging Face, Sentence-Transformers, custom ML).
- Text Builder: Function that extracts the text from a document (default: concatenates primitive fields).
- Metadata Builder: Function that builds the metadata dictionary to be stored alongside vectors.
- ID Getter: Function that assigns a stable ID (default: `_key` from Arango).

### Example: Dummy Embedding Model
from agents_wiki.embeddings_store import EmbeddingModel, ArangoToEmbeddingsIngestor
from agents_wiki.db import ArangoSDK
from agents_wiki.embeddings import EmbeddingsSDK
# Simple toy embedding model
class DummyModel(EmbeddingModel):
    """Toy embedding backend for demos and wiring tests.

    Each text is mapped to a vector whose components are all equal to
    ``(hash(text) % 100) / 100.0``. This is deterministic within a single
    Python process only — str hashing is salted per interpreter run — so it
    is suitable for examples, never for real semantic search.
    """

    @property
    def dimension(self) -> int:
        # Fixed toy dimensionality.
        return 32

    def embed_texts(self, texts):
        """Return one pseudo-embedding (all components identical) per text."""
        vectors = []
        for text in texts:
            component = (hash(text) % 100) / 100.0
            vectors.append([component] * self.dimension)
        return vectors
# Connect Arango
arango = ArangoSDK(
endpoint="http://arangodb:8529",
db_name="knowledge",
username="root",
password="secret"
)
# Connect Weaviate
weav = EmbeddingsSDK(
provider="weaviate",
weaviate_url="http://weaviate:8080"
)
# Ensure collection exists in Weaviate
if "AgentVectors" not in weav.list_collections():
weav.create_collection("AgentVectors")
### Running the Ingestor
ingestor = ArangoToEmbeddingsIngestor(
arango=arango,
embeddings=weav,
source_collection="agents",
target_collection="AgentVectors",
model=DummyModel(),
projection_fields=["name", "bio", "role"], # fields to concatenate as text
metadata_builder=lambda d: {"name": d.get("name"), "role": d.get("role")},
batch_size=100,
normalize=True # normalize embeddings for cosine similarity
)
stats = ingestor.run(limit=1000)
print(stats) # {"seen": 1000, "embedded": 950, "upserted": 950}
### Example: Hugging Face / Sentence-Transformers Model
from sentence_transformers import SentenceTransformer
class SBERTModel(EmbeddingModel):
    """EmbeddingModel backed by a Sentence-Transformers (SBERT) model."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Load the pretrained model once and cache its output dimension.
        self.model = SentenceTransformer(model_name)
        self._dim = self.model.get_sentence_embedding_dimension()

    @property
    def dimension(self) -> int:
        # Output size reported by the underlying sentence-transformer.
        return self._dim

    def embed_texts(self, texts):
        """Encode *texts* in batches of 64 and return plain Python lists."""
        return self.model.encode(
            texts, batch_size=64, show_progress_bar=False
        ).tolist()
# Use SBERT as embedding model
ingestor = ArangoToEmbeddingsIngestor(
arango=arango,
embeddings=weav,
source_collection="articles",
target_collection="ArticleVectors",
model=SBERTModel(),
projection_fields=["title", "summary"],
text_builder=lambda d: f"{d.get('title','')} {d.get('summary','')}"
)
print(ingestor.run(limit=5000))