Introduction
This cookbook provides quick recipes for common ducklake operations. Each recipe is a self-contained example you can adapt for your workflow.
For a comprehensive real-world example, see the clinical trial data lake vignette.
Setup recipes
Create a new data lake
# Create a data lake in a specific directory
attach_ducklake("my_lake", lake_path = vignette_temp_dir)Attach to an existing data lake
# Attach to an existing lake (creates it if it doesn't exist)
attach_ducklake("existing_lake", lake_path = "/path/to/data_lake")Detach from a data lake
# Detach when done (doesn't delete the lake)
detach_ducklake("my_lake")Loading data recipes
Load data from a data.frame
with_transaction(
  create_table(mtcars, "cars"),
  author = "Data Engineer",
  commit_message = "Initial car data load"
)
#> Transaction started
#> Transaction committed
#> Snapshot metadata updated
Update an existing table
# Create a second version of the cars table
with_transaction(
get_ducklake_table("cars") |>
mutate(kpl = mpg * 0.425144) |> # Add km/L conversion
replace_table("cars"),
author = "Data Engineer",
commit_message = "Add km/L metric to cars table"
)
#> Transaction started
#> Transaction committed
#> Snapshot metadata updated
Load data from a CSV file
# First write a sample CSV (in practice, you'd have an existing file)
csv_path <- file.path(vignette_temp_dir, "sample_data.csv")
write.csv(head(iris, 20), csv_path, row.names = FALSE)
# Load the CSV into the data lake
with_transaction(
  create_table(csv_path, "iris_sample"),
  author = "Data Engineer",
  commit_message = "Load iris sample from CSV"
)
#> Transaction started
#> Transaction committed
#> Snapshot metadata updated
Load data from a URL
# ducklake can load data directly from URLs
with_transaction(
create_table("https://example.com/data.csv", "remote_data"),
author = "Data Engineer",
commit_message = "Load remote dataset"
)
Load with a dplyr pipeline
with_transaction(
  mtcars |>
    filter(mpg > 20) |>
    create_table("efficient_cars"),
  author = "Data Analyst",
  commit_message = "Load filtered car data"
)
#> Transaction started
#> Transaction committed
#> Snapshot metadata updated
List all tables in the lake
# See what tables exist in your lake
get_ducklake_table("duckdb_tables") |>
  filter(schema_name == "main") |>
  select(table_name) |>
  collect() |>
  print(n = Inf)
#> # A tibble: 25 × 1
#> table_name
#> <chr>
#> 1 ducklake_column
#> 2 ducklake_column_mapping
#> 3 ducklake_column_tag
#> 4 ducklake_data_file
#> 5 ducklake_delete_file
#> 6 ducklake_files_scheduled_for_deletion
#> 7 ducklake_file_column_stats
#> 8 ducklake_file_partition_value
#> 9 ducklake_inlined_data_tables
#> 10 ducklake_metadata
#> 11 ducklake_name_mapping
#> 12 ducklake_partition_column
#> 13 ducklake_partition_info
#> 14 ducklake_schema
#> 15 ducklake_schema_versions
#> 16 ducklake_snapshot
#> 17 ducklake_snapshot_changes
#> 18 ducklake_table
#> 19 ducklake_table_column_stats
#> 20 ducklake_table_stats
#> 21 ducklake_tag
#> 22 ducklake_view
#> 23 efficient_cars
#> 24 iris_sample
#> 25 cars
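The listing above includes ducklake's internal metadata tables. A variant you may want (a sketch; it assumes dbplyr's grepl() translation for DuckDB) keeps only your own tables:
# Sketch: hide the internal ducklake_* metadata tables
get_ducklake_table("duckdb_tables") |>
  filter(schema_name == "main", !grepl("^ducklake_", table_name)) |>
  select(table_name) |>
  collect()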
Reading data recipes
Read a table
# Returns a lazy dplyr tbl
cars_data <- get_ducklake_table("cars")
# Use dplyr verbs
cars_data |>
  filter(cyl == 6) |>
  select(mpg, cyl, hp) |>
  head(3)
#> # Source: SQL [?? x 3]
#> # Database: DuckDB 1.4.4 [unknown@Linux 6.11.0-1018-azure:R 4.5.2//tmp/RtmpCz68LV/duckplyr/duckplyr1fb377b95736.duckdb]
#> mpg cyl hp
#> <dbl> <dbl> <dbl>
#> 1 21 6 110
#> 2 21 6 110
#> 3 21.4 6 110
Collect data into memory
# Fetch all data into a data.frame
cars_df <- get_ducklake_table("cars") |> collect()
head(cars_df, 3)
#> # A tibble: 3 × 12
#> mpg cyl disp hp drat wt qsec vs am gear carb kpl
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 8.93
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 8.93
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 9.69
View all versions of a table
# See all snapshots for the cars table
list_table_snapshots("cars")
#> snapshot_id snapshot_time schema_version
#> 2 1 2026-02-09 21:20:08 1
#> 3 2 2026-02-09 21:20:08 2
#> changes
#> 2 tables_created, tables_inserted_into, main.cars, 1
#> 3 tables_created, tables_dropped, tables_inserted_into, main.cars, 1, 2
#> author commit_message commit_extra_info
#> 2 Data Engineer Initial car data load <NA>
#> 3 Data Engineer Add km/L metric to cars table <NA>
Read a specific version
# Query data as it existed at snapshot 1
get_ducklake_table_version("cars", version = 1) |>
  collect()
Read data at a specific timestamp
# Query data as of a specific time
get_ducklake_table_asof("cars", timestamp = "2024-01-15 10:30:00") |>
  collect()
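Hard-coded timestamps are brittle. One alternative (a sketch, assuming list_table_snapshots() returns the columns shown above as a data frame) is to pull a real snapshot time from the metadata first:
# Sketch: query the table as of its earliest recorded snapshot
snaps <- list_table_snapshots("cars")
get_ducklake_table_asof("cars", timestamp = min(snaps$snapshot_time)) |>
  collect()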
Updating data recipes
Replace entire table
with_transaction(
get_ducklake_table("cars") |>
mutate(hp_per_cyl = hp / as.numeric(cyl)) |> # Add derived metric
replace_table("cars"),
author = "Data Engineer",
commit_message = "Add horsepower per cylinder metric"
)
#> Transaction started
#> Transaction committed
#> Snapshot metadata updated
Note: For most use cases, use replace_table() to update tables. This creates clean snapshots and maintains full versioning. Advanced row-level operations (rows_update, rows_insert, rows_delete) are available when you need granular control, but they do not create versioned snapshots.
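For illustration, a hypothetical sketch of a row-level update; it assumes dplyr's rows_update() is supported on the lazy table via dbplyr, and the patch frame and key column are illustrative only:
# Hypothetical sketch: raise hp for all 6-cylinder rows in place
patch <- data.frame(cyl = 6, hp = 120)
get_ducklake_table("cars") |>
  rows_update(patch, by = "cyl", unmatched = "ignore",
              copy = TRUE, in_place = TRUE)
# Caution: as noted above, this does not create a versioned snapshot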
Metadata and versioning recipes
List all tables
get_ducklake_table("duckdb_tables") |>
  filter(schema_name == "main") |>
  select(table_name) |>
  collect()
#> # A tibble: 25 × 1
#> table_name
#> <chr>
#> 1 ducklake_column
#> 2 ducklake_column_mapping
#> 3 ducklake_column_tag
#> 4 ducklake_data_file
#> 5 ducklake_delete_file
#> 6 ducklake_files_scheduled_for_deletion
#> 7 ducklake_file_column_stats
#> 8 ducklake_file_partition_value
#> 9 ducklake_inlined_data_tables
#> 10 ducklake_metadata
#> # ℹ 15 more rows
View all snapshots
list_table_snapshots()
#> snapshot_id snapshot_time schema_version
#> 1 0 2026-02-09 21:20:07 0
#> 2 1 2026-02-09 21:20:08 1
#> 3 2 2026-02-09 21:20:08 2
#> 4 3 2026-02-09 21:20:08 3
#> 5 4 2026-02-09 21:20:08 4
#> 6 5 2026-02-09 21:20:09 5
#> changes
#> 1 schemas_created, main
#> 2 tables_created, tables_inserted_into, main.cars, 1
#> 3 tables_created, tables_dropped, tables_inserted_into, main.cars, 1, 2
#> 4 tables_created, tables_inserted_into, main.iris_sample, 3
#> 5 tables_created, tables_inserted_into, main.efficient_cars, 4
#> 6 tables_created, tables_dropped, tables_inserted_into, main.cars, 2, 5
#> author commit_message commit_extra_info
#> 1 <NA> <NA> <NA>
#> 2 Data Engineer Initial car data load <NA>
#> 3 Data Engineer Add km/L metric to cars table <NA>
#> 4 Data Engineer Load iris sample from CSV <NA>
#> 5 Data Analyst Load filtered car data <NA>
#> 6 Data Engineer Add horsepower per cylinder metric <NA>
View snapshots for a specific table
list_table_snapshots("cars")Restore a table to a previous version
# Use time travel to read an old version, then replace the current table
with_transaction(
  get_ducklake_table_version("cars", version = 1) |>
    replace_table("cars"),
  author = "Data Engineer",
  commit_message = "Restore to version 1"
)
list_table_snapshots("cars")Transaction recipes
Simple transaction
with_transaction(
  create_table(my_data, "my_table"),
  author = "Your Name",
  commit_message = "What changed and why"
)
Multi-step transaction
with_transaction({
  # All these operations happen atomically
  create_table(raw_data, "raw_table")
  get_ducklake_table("raw_table") |>
    filter(!is.na(key_field)) |>
    create_table("clean_table")
  get_ducklake_table("clean_table") |>
    mutate(derived_field = calculate_something(x)) |>
    create_table("analysis_table")
},
author = "Data Engineer",
commit_message = "Full ETL pipeline run"
)
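Because the block is atomic, an error anywhere in it should leave the lake untouched. A minimal sketch, assuming with_transaction() rolls back on error:
# Sketch: a failing step aborts the whole transaction
try(
  with_transaction({
    create_table(mtcars, "temp_table")
    stop("simulated failure") # nothing above is committed
  },
  author = "Data Engineer",
  commit_message = "This commit never lands"
  )
)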
Manual transaction control
# For fine-grained control
begin_transaction()
create_table(data1, "table1")
create_table(data2, "table2")
# Commit or rollback
commit_transaction(
author = "Your Name",
commit_message = "Manual transaction commit"
)
# Or if something went wrong:
# rollback_transaction()
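When driving transactions manually, pair them with error handling so a failure actually triggers the rollback. A minimal sketch (mtcars and iris stand in for real data):
# Sketch: roll back automatically if any step errors
begin_transaction()
tryCatch({
  create_table(mtcars, "table1")
  create_table(iris, "table2")
  commit_transaction(
    author = "Your Name",
    commit_message = "Manual transaction commit"
  )
}, error = function(e) {
  rollback_transaction()
  stop(e)
})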
Query optimization recipes
Preview query without execution
get_ducklake_table("cars") |>
  filter(mpg > 25) |>
  mutate(efficient = TRUE) |>
  show_ducklake_query()
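Once the generated SQL looks right, run the same pipeline with collect() in place of show_ducklake_query():
get_ducklake_table("cars") |>
  filter(mpg > 25) |>
  mutate(efficient = TRUE) |>
  collect()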
Filter early for performance
# Good: Filter before other operations
get_ducklake_table("cars") |>
  filter(cyl == 6) |>
  mutate(kpl = mpg * 0.425144) |>
  head(3)
#> # Source: SQL [?? x 13]
#> # Database: DuckDB 1.4.4 [unknown@Linux 6.11.0-1018-azure:R 4.5.2//tmp/RtmpCz68LV/duckplyr/duckplyr1fb377b95736.duckdb]
#> mpg cyl disp hp drat wt qsec vs am gear carb kpl
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 8.93
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 8.93
#> 3 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 9.10
#> # ℹ 1 more variable: hp_per_cyl <dbl>
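For contrast, the shape to avoid: collecting before filtering materializes the whole table in R and forfeits DuckDB's filter pushdown.
# Avoid: collect() first pulls every row into R before filtering
get_ducklake_table("cars") |>
  collect() |>
  filter(cyl == 6) |>
  mutate(kpl = mpg * 0.425144)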
Use specific columns
# Good: Select only needed columns
get_ducklake_table("cars") |>
  select(mpg, cyl, hp) |>
  filter(mpg > 25)
#> # Source: SQL [?? x 3]
#> # Database: DuckDB 1.4.4 [unknown@Linux 6.11.0-1018-azure:R 4.5.2//tmp/RtmpCz68LV/duckplyr/duckplyr1fb377b95736.duckdb]
#> mpg cyl hp
#> <dbl> <dbl> <dbl>
#> 1 32.4 4 66
#> 2 30.4 4 52
#> 3 33.9 4 65
#> 4 27.3 4 66
#> 5 26 4 91
#> 6 30.4 4 113
Cleanup
# Detach from the lake
detach_ducklake("my_lake")See also
- Modifying Tables - Detailed guide to table modification approaches
- Transactions - Advanced transaction patterns
- Time Travel - Comprehensive time travel guide
- Clinical Trial Data Lake - Complete real-world workflow
