Commit 0cebee3

chore: update for dataset rewrite (#66)

Signed-off-by: Grant Linville <[email protected]>
g-linville authored Nov 7, 2024
1 parent e9f3b2f commit 0cebee3

Showing 4 changed files with 68 additions and 124 deletions.
28 changes: 11 additions & 17 deletions gptscript/datasets.py
@@ -1,37 +1,31 @@
 import base64
 from typing import Dict
 from pydantic import BaseModel, field_serializer, field_validator, BeforeValidator
 
 
+class DatasetMeta(BaseModel):
+    id: str
+    name: str
+    description: str
+
+
 class DatasetElementMeta(BaseModel):
     name: str
     description: str
 
 
 class DatasetElement(BaseModel):
     name: str
-    description: str
-    contents: bytes
+    description: str = ""
+    contents: str = ""
+    binaryContents: bytes = b""
 
-    @field_serializer("contents")
+    @field_serializer("binaryContents")
     def serialize_contents(self, value: bytes) -> str:
         return base64.b64encode(value).decode("utf-8")
 
-    @field_validator("contents", mode="before")
+    @field_validator("binaryContents", mode="before")
     def deserialize_contents(cls, value) -> bytes:
         if isinstance(value, str):
             return base64.b64decode(value)
         return value
-
-
-class DatasetMeta(BaseModel):
-    id: str
-    name: str
-    description: str
-
-
-class Dataset(BaseModel):
-    id: str
-    name: str
-    description: str
-    elements: Dict[str, DatasetElementMeta]
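
The reworked model splits text and binary payloads. A minimal sketch of the round trip implied by the serializer/validator pair above (standalone pydantic usage, nothing beyond what the diff defines): binaryContents is base64-encoded on model_dump() and decoded back to bytes on validation, while contents travels as a plain string.

import base64

from gptscript.datasets import DatasetElement

raw = b"\x00\x01 binary payload"
element = DatasetElement(name="blob", binaryContents=raw)

# The field_serializer turns the bytes into a base64 string on dump,
# which is what gets embedded in the JSON "input" sent to the dataset tool.
dumped = element.model_dump()
assert dumped["binaryContents"] == base64.b64encode(raw).decode("utf-8")

# The mode="before" field_validator reverses it when data comes back in.
restored = DatasetElement.model_validate(dumped)
assert restored.binaryContents == raw

# Plain-text elements skip base64 entirely.
text_element = DatasetElement(name="note", contents="hello")
assert text_element.model_dump()["contents"] == "hello"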
99 changes: 25 additions & 74 deletions gptscript/gptscript.py
@@ -8,7 +8,7 @@
 
 from gptscript.confirm import AuthResponse
 from gptscript.credentials import Credential, to_credential
-from gptscript.datasets import DatasetMeta, Dataset, DatasetElementMeta, DatasetElement
+from gptscript.datasets import DatasetElementMeta, DatasetElement, DatasetMeta
 from gptscript.fileinfo import FileInfo
 from gptscript.frame import RunFrame, CallFrame, PromptFrame, Program
 from gptscript.opts import GlobalOptions
@@ -213,109 +213,58 @@ async def delete_credential(self, context: str = "default", name: str = "") -> str:
             {"context": [context], "name": name}
         )
 
-    async def list_datasets(self, workspace_id: str) -> List[DatasetMeta]:
-        if workspace_id == "":
-            workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"]
-
+    # list_datasets returns an array of dataset IDs
+    async def list_datasets(self) -> List[DatasetMeta]:
         res = await self._run_basic_command(
             "datasets",
-            {"input": "{}", "workspaceID": workspace_id, "datasetToolRepo": self.opts.DatasetToolRepo,
-             "env": self.opts.Env}
-        )
-        return [DatasetMeta.model_validate(d) for d in json.loads(res)]
-
-    async def create_dataset(self, workspace_id: str, name: str, description: str = "") -> Dataset:
-        if workspace_id == "":
-            workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"]
-
-        if name == "":
-            raise ValueError("name cannot be empty")
-
-        res = await self._run_basic_command(
-            "datasets/create",
-            {
-                "input": json.dumps({"datasetName": name, "datasetDescription": description}),
-                "workspaceID": workspace_id,
-                "datasetToolRepo": self.opts.DatasetToolRepo,
-                "env": self.opts.Env,
-            }
-        )
-        return Dataset.model_validate_json(res)
-
-    async def add_dataset_element(self, workspace_id: str, datasetID: str, elementName: str, elementContent: bytes,
-                                  elementDescription: str = "") -> DatasetElementMeta:
-        if workspace_id == "":
-            workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"]
-
-        if datasetID == "":
-            raise ValueError("datasetID cannot be empty")
-        elif elementName == "":
-            raise ValueError("elementName cannot be empty")
-        elif not elementContent:
-            raise ValueError("elementContent cannot be empty")
-
-        res = await self._run_basic_command(
-            "datasets/add-element",
-            {
-                "input": json.dumps({
-                    "datasetID": datasetID,
-                    "elementName": elementName,
-                    "elementContent": base64.b64encode(elementContent).decode("utf-8"),
-                    "elementDescription": elementDescription,
-                }),
-                "workspaceID": workspace_id,
-                "datasetToolRepo": self.opts.DatasetToolRepo,
+            {
+                "input": "{}",
+                "datasetTool": self.opts.DatasetTool,
                 "env": self.opts.Env
             }
         )
-        return DatasetElementMeta.model_validate_json(res)
+        return [DatasetMeta.model_validate(d) for d in json.loads(res)]
 
-    async def add_dataset_elements(self, workspace_id: str, datasetID: str, elements: List[DatasetElement]) -> str:
-        if workspace_id == "":
-            workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"]
-
-        if datasetID == "":
-            raise ValueError("datasetID cannot be empty")
-        elif not elements:
+    async def add_dataset_elements(
+            self,
+            elements: List[DatasetElement],
+            datasetID: str = "",
+            name: str = "",
+            description: str = ""
+    ) -> str:
+        if not elements:
             raise ValueError("elements cannot be empty")
 
         res = await self._run_basic_command(
             "datasets/add-elements",
             {
                 "input": json.dumps({
                     "datasetID": datasetID,
+                    "name": name,
+                    "description": description,
                     "elements": [element.model_dump() for element in elements],
                 }),
-                "workspaceID": workspace_id,
-                "datasetToolRepo": self.opts.DatasetToolRepo,
+                "datasetTool": self.opts.DatasetTool,
                 "env": self.opts.Env
             }
         )
         return res
 
-    async def list_dataset_elements(self, workspace_id: str, datasetID: str) -> List[DatasetElementMeta]:
-        if workspace_id == "":
-            workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"]
-
+    async def list_dataset_elements(self, datasetID: str) -> List[DatasetElementMeta]:
         if datasetID == "":
             raise ValueError("datasetID cannot be empty")
 
         res = await self._run_basic_command(
             "datasets/list-elements",
             {
                 "input": json.dumps({"datasetID": datasetID}),
-                "workspaceID": workspace_id,
-                "datasetToolRepo": self.opts.DatasetToolRepo,
+                "datasetTool": self.opts.DatasetTool,
                 "env": self.opts.Env
             }
         )
         return [DatasetElementMeta.model_validate(d) for d in json.loads(res)]
 
-    async def get_dataset_element(self, workspace_id: str, datasetID: str, elementName: str) -> DatasetElement:
-        if workspace_id == "":
-            workspace_id = os.environ["GPTSCRIPT_WORKSPACE_ID"]
-
+    async def get_dataset_element(self, datasetID: str, elementName: str) -> DatasetElement:
        if datasetID == "":
             raise ValueError("datasetID cannot be empty")
         elif elementName == "":
@@ -324,9 +273,11 @@ async def get_dataset_element(self, workspace_id: str, datasetID: str, elementName: str) -> DatasetElement:
         res = await self._run_basic_command(
             "datasets/get-element",
             {
-                "input": json.dumps({"datasetID": datasetID, "element": elementName}),
-                "workspaceID": workspace_id,
-                "datasetToolRepo": self.opts.DatasetToolRepo,
+                "input": json.dumps({
+                    "datasetID": datasetID,
+                    "name": elementName,
+                }),
+                "datasetTool": self.opts.DatasetTool,
                 "env": self.opts.Env,
             }
         )
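
Taken together, the rewrite removes the per-call workspace_id plumbing (the workspace now rides along in the environment via opts.Env) and folds dataset creation into add_dataset_elements: omit datasetID and a dataset is created, pass one and the elements are appended. A sketch of the resulting flow, assuming an environment that already exports an API key and GPTSCRIPT_WORKSPACE_ID; the dataset name and element here are illustrative.

import asyncio

from gptscript.datasets import DatasetElement
from gptscript.gptscript import GPTScript


async def main():
    g = GPTScript()  # inherits os.environ, including GPTSCRIPT_WORKSPACE_ID
    try:
        # No datasetID given, so this call creates the dataset and returns its ID.
        dataset_id = await g.add_dataset_elements(
            [DatasetElement(name="greeting", contents="hello world")],
            name="demo",
            description="created through the rewritten dataset API",
        )

        # The returned ID is all that later calls need.
        names = [e.name for e in await g.list_dataset_elements(dataset_id)]
        element = await g.get_dataset_element(dataset_id, "greeting")
        print(names, element.contents)
    finally:
        g.close()


asyncio.run(main())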
6 changes: 3 additions & 3 deletions gptscript/opts.py
@@ -12,7 +12,7 @@ def __init__(
             defaultModelProvider: str = "",
             defaultModel: str = "",
             cacheDir: str = "",
-            datasetToolRepo: str = "",
+            datasetTool: str = "",
             workspaceTool: str = "",
             env: list[str] = None,
     ):
@@ -23,7 +23,7 @@
         self.DefaultModel = defaultModel
         self.DefaultModelProvider = defaultModelProvider
         self.CacheDir = cacheDir
-        self.DatasetToolRepo = datasetToolRepo
+        self.DatasetTool = datasetTool
         self.WorkspaceTool = workspaceTool
         if env is None:
             env = [f"{k}={v}" for k, v in os.environ.items()]
@@ -42,7 +42,7 @@ def merge(self, other: Self) -> Self:
         cp.DefaultModel = other.DefaultModel if other.DefaultModel != "" else self.DefaultModel
         cp.DefaultModelProvider = other.DefaultModelProvider if other.DefaultModelProvider != "" else self.DefaultModelProvider
         cp.CacheDir = other.CacheDir if other.CacheDir != "" else self.CacheDir
-        cp.DatasetToolRepo = other.DatasetToolRepo if other.DatasetToolRepo != "" else self.DatasetToolRepo
+        cp.DatasetTool = other.DatasetTool if other.DatasetTool != "" else self.DatasetTool
         cp.WorkspaceTool = other.WorkspaceTool if other.WorkspaceTool != "" else self.WorkspaceTool
         cp.Env = (other.Env or [])
         cp.Env.extend(self.Env or [])
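
The option rename in opts.py is mechanical, but the merge() precedence is easy to get backwards: the other side wins only when it is non-empty. A small sketch; the tool reference is an illustrative placeholder, not a documented default.

from gptscript.opts import GlobalOptions

# Hypothetical dataset tool reference, for illustration only.
base = GlobalOptions(datasetTool="github.com/example/my-dataset-tool")
override = GlobalOptions()  # leaves datasetTool empty

# merge() prefers other.DatasetTool but falls back when it is empty,
# so the base value survives an empty override.
merged = base.merge(override)
assert merged.DatasetTool == "github.com/example/my-dataset-tool"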
59 changes: 29 additions & 30 deletions tests/test_gptscript.py
@@ -760,60 +760,59 @@ async def test_credentials(gptscript):
 
 @pytest.mark.asyncio
 async def test_datasets(gptscript):
-    workspace_id = await gptscript.create_workspace("directory")
-    dataset_name = str(os.urandom(8).hex())
+    os.environ["GPTSCRIPT_WORKSPACE_ID"] = await gptscript.create_workspace("directory")
+
+    new_client = GPTScript(GlobalOptions(
+        apiKey=os.getenv("OPENAI_API_KEY"),
+        env=[f"{k}={v}" for k, v in os.environ.items()],
+    ))
 
     # Create dataset
-    dataset = await gptscript.create_dataset(workspace_id, dataset_name, "this is a test dataset")
-    assert dataset.id != "", "Expected dataset id to be set"
-    assert dataset.name == dataset_name, "Expected dataset name to match"
-    assert dataset.description == "this is a test dataset", "Expected dataset description to match"
-    assert len(dataset.elements) == 0, "Expected dataset elements to be empty"
-
-    # Add an element
-    element_meta = await gptscript.add_dataset_element(workspace_id, dataset.id, "element1", b"element1 contents",
-                                                       "element1 description")
-    assert element_meta.name == "element1", "Expected element name to match"
-    assert element_meta.description == "element1 description", "Expected element description to match"
+    dataset_id = await new_client.add_dataset_elements([
+        DatasetElement(name="element1", contents="element1 contents", description="element1 description"),
+        DatasetElement(name="element2", binaryContents=b"element2 contents", description="element2 description"),
+    ], name="test-dataset", description="test dataset description")
 
     # Add two more elements
-    await gptscript.add_dataset_elements(workspace_id, dataset.id, [
-        DatasetElement(name="element2", contents=b"element2 contents", description="element2 description"),
-        DatasetElement(name="element3", contents=b"element3 contents", description="element3 description"),
-    ])
+    await new_client.add_dataset_elements([
+        DatasetElement(name="element3", contents="element3 contents", description="element3 description"),
+        DatasetElement(name="element4", contents="element3 contents", description="element4 description"),
+    ], datasetID=dataset_id)
 
     # Get the elements
-    e1 = await gptscript.get_dataset_element(workspace_id, dataset.id, "element1")
+    e1 = await new_client.get_dataset_element(dataset_id, "element1")
     assert e1.name == "element1", "Expected element name to match"
-    assert e1.contents == b"element1 contents", "Expected element contents to match"
+    assert e1.contents == "element1 contents", "Expected element contents to match"
     assert e1.description == "element1 description", "Expected element description to match"
-    e2 = await gptscript.get_dataset_element(workspace_id, dataset.id, "element2")
+    e2 = await new_client.get_dataset_element(dataset_id, "element2")
     assert e2.name == "element2", "Expected element name to match"
-    assert e2.contents == b"element2 contents", "Expected element contents to match"
+    assert e2.binaryContents == b"element2 contents", "Expected element contents to match"
     assert e2.description == "element2 description", "Expected element description to match"
-    e3 = await gptscript.get_dataset_element(workspace_id, dataset.id, "element3")
+    e3 = await new_client.get_dataset_element(dataset_id, "element3")
     assert e3.name == "element3", "Expected element name to match"
-    assert e3.contents == b"element3 contents", "Expected element contents to match"
+    assert e3.contents == "element3 contents", "Expected element contents to match"
     assert e3.description == "element3 description", "Expected element description to match"
 
     # List elements in the dataset
-    elements = await gptscript.list_dataset_elements(workspace_id, dataset.id)
-    assert len(elements) == 3, "Expected one element in the dataset"
+    elements = await new_client.list_dataset_elements(dataset_id)
+    assert len(elements) == 4, "Expected four elements in the dataset"
     assert elements[0].name == "element1", "Expected element name to match"
     assert elements[0].description == "element1 description", "Expected element description to match"
     assert elements[1].name == "element2", "Expected element name to match"
     assert elements[1].description == "element2 description", "Expected element description to match"
     assert elements[2].name == "element3", "Expected element name to match"
     assert elements[2].description == "element3 description", "Expected element description to match"
+    assert elements[3].name == "element4", "Expected element name to match"
+    assert elements[3].description == "element4 description", "Expected element description to match"
 
     # List datasets
-    datasets = await gptscript.list_datasets(workspace_id)
+    datasets = await new_client.list_datasets()
     assert len(datasets) > 0, "Expected at least one dataset"
-    assert datasets[0].id == dataset.id, "Expected dataset id to match"
-    assert datasets[0].name == dataset_name, "Expected dataset name to match"
-    assert datasets[0].description == "this is a test dataset", "Expected dataset description to match"
+    assert datasets[0].id == dataset_id, "Expected dataset id to match"
+    assert datasets[0].name == "test-dataset", "Expected dataset name to match"
+    assert datasets[0].description == "test dataset description", "Expected dataset description to match"
 
-    await gptscript.delete_workspace(workspace_id)
+    await gptscript.delete_workspace(os.environ["GPTSCRIPT_WORKSPACE_ID"])
 
 
 @pytest.mark.asyncio
