# Decoding Examples

Practical examples using Rugo's prototype decoder.
> **Experimental Feature:** These examples use the prototype decoder. For production code, use PyArrow or FastParquet.
## Basic Examples

### Check and Decode
```python
import rugo.parquet as parquet_meta

filename = "simple_data.parquet"

# Check if decodable
if parquet_meta.can_decode(filename):
    # Decode integer column
    user_ids = parquet_meta.decode_column(filename, "user_id")
    print(f"User IDs: {user_ids[:10]}")  # First 10

    # Decode string column
    names = parquet_meta.decode_column(filename, "name")
    print(f"Names: {names[:5]}")
else:
    print("File cannot be decoded with Rugo")
```
### Integer Data
```python
# Create a simple integer file with PyArrow
import pyarrow as pa
import pyarrow.parquet as pq

# Create data
table = pa.table({
    'id': pa.array([1, 2, 3, 4, 5], type=pa.int64()),
    'count': pa.array([10, 20, 30, 40, 50], type=pa.int32())
})

# Write uncompressed with PLAIN encoding
pq.write_table(
    table,
    "integers.parquet",
    compression="none",
    use_dictionary=False
)

# Decode with Rugo
import rugo.parquet as parquet_meta

ids = parquet_meta.decode_column("integers.parquet", "id")
print(f"IDs: {ids}")  # [1, 2, 3, 4, 5]

counts = parquet_meta.decode_column("integers.parquet", "count")
print(f"Counts: {counts}")  # [10, 20, 30, 40, 50]
```
### String Data
```python
import pyarrow as pa
import pyarrow.parquet as pq
import rugo.parquet as parquet_meta

# Create string file
table = pa.table({
    'name': pa.array(['Alice', 'Bob', 'Charlie'], type=pa.string()),
    'city': pa.array(['NYC', 'LA', 'SF'], type=pa.string())
})

pq.write_table(
    table,
    "strings.parquet",
    compression="none",
    use_dictionary=False
)

# Decode
names = parquet_meta.decode_column("strings.parquet", "name")
print(f"Names: {names}")  # ['Alice', 'Bob', 'Charlie']

cities = parquet_meta.decode_column("strings.parquet", "city")
print(f"Cities: {cities}")  # ['NYC', 'LA', 'SF']
```
## Complete Example

See `decode_example.py` in the repository:
```python
import rugo.parquet as parquet_meta
import pyarrow as pa
import pyarrow.parquet as pq

# Create test data
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'score': [95, 87, 91, 88, 93]
}

table = pa.table({
    'id': pa.array(data['id'], type=pa.int64()),
    'name': pa.array(data['name'], type=pa.string()),
    'score': pa.array(data['score'], type=pa.int32())
})

# Write uncompressed
filename = "test_decode.parquet"
pq.write_table(
    table,
    filename,
    compression="none",
    use_dictionary=False
)

# Check decodability
if parquet_meta.can_decode(filename):
    print("File can be decoded!")

    # Decode each column
    ids = parquet_meta.decode_column(filename, "id")
    names = parquet_meta.decode_column(filename, "name")
    scores = parquet_meta.decode_column(filename, "score")

    # Display results
    print("\nDecoded data:")
    for i in range(len(ids)):
        print(f"  ID: {ids[i]}, Name: {names[i]}, Score: {scores[i]}")
else:
    print("File cannot be decoded")
```
## Advanced Examples

### Validate Before Decoding
```python
import rugo.parquet as parquet_meta

def safe_decode(filename, column_name):
    """Safely decode a column with validation."""
    # First, check if the file can be decoded
    if not parquet_meta.can_decode(filename):
        raise ValueError(f"Cannot decode {filename}")

    # Check if the column exists
    metadata = parquet_meta.read_metadata(filename)
    column_names = [c["name"] for c in metadata["schema_columns"]]
    if column_name not in column_names:
        raise ValueError(f"Column {column_name} not found")

    # Decode
    return parquet_meta.decode_column(filename, column_name)

# Use it
try:
    values = safe_decode("data.parquet", "user_id")
    print(f"Successfully decoded {len(values)} values")
except ValueError as e:
    print(f"Error: {e}")
```
### Process Multiple Columns
```python
def decode_all_columns(filename):
    """Decode all columns from a file."""
    # Check decodability
    if not parquet_meta.can_decode(filename):
        return None

    # Get schema
    metadata = parquet_meta.read_metadata(filename)

    # Decode each column
    result = {}
    for col in metadata["schema_columns"]:
        col_name = col["name"]
        try:
            result[col_name] = parquet_meta.decode_column(filename, col_name)
        except Exception as e:
            print(f"Failed to decode {col_name}: {e}")
    return result

# Use it
data = decode_all_columns("simple.parquet")
if data:
    for col_name, values in data.items():
        print(f"{col_name}: {len(values)} values")
```
### Compare with Metadata
```python
def verify_decode(filename, column_name):
    """Decode and verify against metadata."""
    # Read metadata
    metadata = parquet_meta.read_metadata(filename)

    # Get column info from the first row group
    rg = metadata["row_groups"][0]
    col_info = None
    for col in rg["columns"]:
        if col["name"] == column_name:
            col_info = col
            break

    if not col_info:
        raise ValueError(f"Column {column_name} not found")

    # Decode
    values = parquet_meta.decode_column(filename, column_name)

    # Verify
    print(f"Column: {column_name}")
    print(f"  Expected values: {col_info['num_values']}")
    print(f"  Decoded values: {len(values)}")
    print(f"  Metadata min: {col_info['min']}")
    print(f"  Actual min: {min(values)}")
    print(f"  Metadata max: {col_info['max']}")
    print(f"  Actual max: {max(values)}")

    return values
```
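Unlike the other helpers on this page, `verify_decode` is not called anywhere above. A minimal usage sketch, assuming the `integers.parquet` file created in the Integer Data example:

```python
# Hypothetical usage against integers.parquet from the Integer Data example
values = verify_decode("integers.parquet", "id")
# Prints metadata vs. decoded counts and min/max, then returns [1, 2, 3, 4, 5]
```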
### Create Compatible Test Data
```python
def create_decodable_parquet(filename, data_dict):
    """Create a Parquet file compatible with the Rugo decoder."""
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Convert data to a PyArrow table
    arrays = {}
    for col_name, values in data_dict.items():
        # Detect type
        if isinstance(values[0], str):
            arrays[col_name] = pa.array(values, type=pa.string())
        elif isinstance(values[0], int):
            # Use int64 for the larger range
            arrays[col_name] = pa.array(values, type=pa.int64())
        else:
            raise ValueError(f"Unsupported type for {col_name}")

    table = pa.table(arrays)

    # Write with decoder-compatible settings
    pq.write_table(
        table,
        filename,
        compression="none",     # No compression
        use_dictionary=False,   # PLAIN encoding
        write_statistics=True   # Include statistics
    )

    print(f"Created {filename}")
    return filename

# Use it
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'score': [95, 87, 91, 88, 93]
}
filename = create_decodable_parquet("test.parquet", data)

# Verify it works
if parquet_meta.can_decode(filename):
    ids = parquet_meta.decode_column(filename, "id")
    print(f"Decoded IDs: {ids}")
```
## Error Handling Examples

### Handle Unsupported Files
```python
def try_decode(filename, column_name):
    """Try to decode, fall back to PyArrow if needed."""
    import rugo.parquet as parquet_meta

    if parquet_meta.can_decode(filename):
        print("Using Rugo decoder")
        return parquet_meta.decode_column(filename, column_name)
    else:
        print("Using PyArrow (Rugo not compatible)")
        import pyarrow.parquet as pq
        table = pq.read_table(filename, columns=[column_name])
        return table.column(column_name).to_pylist()

# Works with any file
values = try_decode("any_file.parquet", "column_name")
```
### Detailed Error Reporting
```python
def decode_with_diagnostics(filename, column_name):
    """Decode with detailed error reporting."""
    metadata = parquet_meta.read_metadata(filename)

    # Check the column's compression, encoding, and type
    rg = metadata["row_groups"][0]
    for col in rg["columns"]:
        if col["name"] != column_name:
            continue
        if col["compression_codec"] != "UNCOMPRESSED":
            print(f"❌ Compression: {col['compression_codec']}")
            return None
        if "PLAIN" not in col["encodings"]:
            print(f"❌ Encodings: {col['encodings']}")
            return None
        if col["type"] not in ["INT32", "INT64", "BYTE_ARRAY"]:
            print(f"❌ Type: {col['type']}")
            return None

        # All checks passed
        print("✓ All checks passed, decoding...")
        return parquet_meta.decode_column(filename, column_name)

    print(f"❌ Column {column_name} not found")
    return None
```
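A usage sketch for the diagnostic helper, assuming the `test_decode.parquet` file written in the complete example above:

```python
# Hypothetical usage against test_decode.parquet from the complete example
scores = decode_with_diagnostics("test_decode.parquet", "score")
if scores is not None:
    print(f"Decoded {len(scores)} values")
```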
## Next Steps
- Limitations - Understand constraints
- Overview - Decoder capabilities
- API Reference - Function documentation