Dataset SDK for consistent reads and writes of batch, online, and streaming data.
Project description
Welcome to @datasets
TODO
import pandas as pd
from metaflow import FlowSpec, Parameter, current, step
from datasets import DatasetType, Mode
# Can also invoke from CLI:
# > python datasets/tutorials/0_hello_dataset_flow.py run \
# --hello_dataset '{"name": "foo", "partition_by": "region", "mode": "Write"}'
class HelloDatasetFlow(FlowSpec):
    """Tutorial flow: write a small DataFrame through a dataset parameter, then read it back.

    The ``hello_dataset`` parameter is declared with ``type=DatasetType`` and a
    default that names the dataset, partitions it by ``region`` and opens it in
    ``Mode.Write``.
    """

    hello_dataset = Parameter(
        "hello_dataset",
        default={
            "name": "HelloDataset",
            "partition_by": "region",
            "mode": Mode.Write,
        },
        type=DatasetType,
    )

    @step
    def start(self):
        """Build a six-row sample frame and write it to ``hello_dataset``."""
        sample = pd.DataFrame(
            {
                "region": list("AAABBB"),
                "zpid": list(range(1, 7)),
            }
        )
        rendered = sample.to_string(index=False)
        print("saving df: \n", rendered)

        # Example of writing to a dataset
        print(f"{self.hello_dataset.program_name=}")
        self.hello_dataset.write(sample)

        # Transition to the final step; kept as the last statement of the step.
        self.next(self.end)

    @step
    def end(self):
        """Load the dataset as a DataFrame, print it, and keep the dataset as an artifact."""
        print(f"I have dataset \n{self.hello_dataset=}")

        # Read back through to_pandas(), passing the current run's id.
        loaded: pd.DataFrame = self.hello_dataset.to_pandas(run_id=current.run_id)
        rendered = loaded.to_string(index=False)
        print("self.hello_dataset.to_pandas():\n", rendered)

        # save this as an output dataset
        self.output_dataset = self.hello_dataset
# Script entry point: instantiate the flow when the file is executed directly
# (e.g. `python 0_hello_dataset_flow.py run`), not when it is imported.
if __name__ == "__main__":
    HelloDatasetFlow()
Project details
Release history Release notifications | RSS feed
Download files
Download the file for your platform. If you're not sure which to choose, learn more about installing packages.
Source Distribution
zdatasets-0.0.10.tar.gz
(47.7 kB
view hashes)
Built Distribution
zdatasets-0.0.10-py3-none-any.whl
(75.1 kB
view hashes)
Close
Hashes for zdatasets-0.0.10-py3-none-any.whl
Algorithm | Hash digest | |
---|---|---|
SHA256 | 222ce99db0feb30162e0b549fe85edffe016e084b568e8b1fa74c2e859fadff3 | |
MD5 | 913ec7af9c36244f326576fa012a5469 | |
BLAKE2b-256 | a83d5ffd15492b3a21f56c4192ed9eba8efe975732c1fc2cafc29bd8c9c62d01 | |