Decoding Examples

Practical examples using Rugo's prototype decoder.

Experimental Feature

These examples use the prototype decoder. For production code, use PyArrow or FastParquet.
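
For comparison, this is a minimal sketch of the production path with PyArrow's standard read API (simple_data.parquet is the file used in the examples below):

import pyarrow.parquet as pq

# Read a single column with PyArrow instead of the prototype decoder
table = pq.read_table("simple_data.parquet", columns=["user_id"])
user_ids = table.column("user_id").to_pylist()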

Basic Examples

Check and Decode

import rugo.parquet as parquet_meta

filename = "simple_data.parquet"

# Check if decodable
if parquet_meta.can_decode(filename):
    # Decode integer column
    user_ids = parquet_meta.decode_column(filename, "user_id")
    print(f"User IDs: {user_ids[:10]}")  # First 10

    # Decode string column
    names = parquet_meta.decode_column(filename, "name")
    print(f"Names: {names[:5]}")
else:
    print("File cannot be decoded with Rugo")

Integer Data

# Create simple integer file with PyArrow
import pyarrow as pa
import pyarrow.parquet as pq

# Create data
table = pa.table({
    'id': pa.array([1, 2, 3, 4, 5], type=pa.int64()),
    'count': pa.array([10, 20, 30, 40, 50], type=pa.int32())
})

# Write uncompressed with PLAIN encoding
pq.write_table(
    table,
    "integers.parquet",
    compression="none",
    use_dictionary=False
)

# Decode with Rugo
import rugo.parquet as parquet_meta

ids = parquet_meta.decode_column("integers.parquet", "id")
print(f"IDs: {ids}")  # [1, 2, 3, 4, 5]

counts = parquet_meta.decode_column("integers.parquet", "count")
print(f"Counts: {counts}")  # [10, 20, 30, 40, 50]

String Data

# Create string file
table = pa.table({
    'name': pa.array(['Alice', 'Bob', 'Charlie'], type=pa.string()),
    'city': pa.array(['NYC', 'LA', 'SF'], type=pa.string())
})

pq.write_table(
    table,
    "strings.parquet",
    compression="none",
    use_dictionary=False
)

# Decode
names = parquet_meta.decode_column("strings.parquet", "name")
print(f"Names: {names}")  # ['Alice', 'Bob', 'Charlie']

cities = parquet_meta.decode_column("strings.parquet", "city")
print(f"Cities: {cities}")  # ['NYC', 'LA', 'SF']

Complete Example

See decode_example.py in the repository:

import rugo.parquet as parquet_meta
import pyarrow as pa
import pyarrow.parquet as pq

# Create test data
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'score': [95, 87, 91, 88, 93]
}

table = pa.table({
    'id': pa.array(data['id'], type=pa.int64()),
    'name': pa.array(data['name'], type=pa.string()),
    'score': pa.array(data['score'], type=pa.int32())
})

# Write uncompressed
filename = "test_decode.parquet"
pq.write_table(
    table,
    filename,
    compression="none",
    use_dictionary=False
)

# Check decodability
if parquet_meta.can_decode(filename):
    print("File can be decoded!")

    # Decode each column
    ids = parquet_meta.decode_column(filename, "id")
    names = parquet_meta.decode_column(filename, "name")
    scores = parquet_meta.decode_column(filename, "score")

    # Display results
    print("\nDecoded data:")
    for id_, name, score in zip(ids, names, scores):
        print(f"  ID: {id_}, Name: {name}, Score: {score}")
else:
    print("File cannot be decoded")

Advanced Examples

Validate Before Decoding

def safe_decode(filename, column_name):
    """Safely decode column with validation."""
    # First, check if file can be decoded
    if not parquet_meta.can_decode(filename):
        raise ValueError(f"Cannot decode {filename}")

    # Check if column exists
    metadata = parquet_meta.read_metadata(filename)
    column_names = [c["name"] for c in metadata["schema_columns"]]

    if column_name not in column_names:
        raise ValueError(f"Column {column_name} not found")

    # Decode
    return parquet_meta.decode_column(filename, column_name)

# Use it
try:
    values = safe_decode("data.parquet", "user_id")
    print(f"Successfully decoded {len(values)} values")
except ValueError as e:
    print(f"Error: {e}")

Process Multiple Columns

def decode_all_columns(filename):
    """Decode all columns from a file."""
    # Check decodability
    if not parquet_meta.can_decode(filename):
        return None

    # Get schema
    metadata = parquet_meta.read_metadata(filename)

    # Decode each column
    result = {}
    for col in metadata["schema_columns"]:
        col_name = col["name"]
        try:
            result[col_name] = parquet_meta.decode_column(filename, col_name)
        except Exception as e:
            print(f"Failed to decode {col_name}: {e}")

    return result

# Use it
data = decode_all_columns("simple.parquet")
if data:
    for col_name, values in data.items():
        print(f"{col_name}: {len(values)} values")

Compare with Metadata

def verify_decode(filename, column_name):
    """Decode and verify against metadata."""
    # Read metadata
    metadata = parquet_meta.read_metadata(filename)

    # Get column info from the first row group
    # (for multi-row-group files, per-group counts may not match the full decode)
    rg = metadata["row_groups"][0]
    col_info = None
    for col in rg["columns"]:
        if col["name"] == column_name:
            col_info = col
            break

    if not col_info:
        raise ValueError(f"Column {column_name} not found")

    # Decode
    values = parquet_meta.decode_column(filename, column_name)

    # Verify
    print(f"Column: {column_name}")
    print(f"  Expected values: {col_info['num_values']}")
    print(f"  Decoded values: {len(values)}")
    print(f"  Metadata min: {col_info['min']}")
    print(f"  Actual min: {min(values)}")
    print(f"  Metadata max: {col_info['max']}")
    print(f"  Actual max: {max(values)}")

    return values
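
A quick check against the integers.parquet file created earlier:

# Use it
values = verify_decode("integers.parquet", "id")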

Create Compatible Test Data

def create_decodable_parquet(filename, data_dict):
    """Create Parquet file compatible with Rugo decoder."""
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Convert data to PyArrow table
    arrays = {}
    for col_name, values in data_dict.items():
        # Detect type
        if isinstance(values[0], str):
            arrays[col_name] = pa.array(values, type=pa.string())
        elif isinstance(values[0], int):
            # Use int64 for larger range
            arrays[col_name] = pa.array(values, type=pa.int64())
        else:
            raise ValueError(f"Unsupported type for {col_name}")

    table = pa.table(arrays)

    # Write with decoder-compatible settings
    pq.write_table(
        table,
        filename,
        compression="none",      # No compression
        use_dictionary=False,    # PLAIN encoding
        write_statistics=True    # Include statistics
    )

    print(f"Created {filename}")
    return filename

# Use it
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'score': [95, 87, 91, 88, 93]
}

filename = create_decodable_parquet("test.parquet", data)

# Verify it works
if parquet_meta.can_decode(filename):
    ids = parquet_meta.decode_column(filename, "id")
    print(f"Decoded IDs: {ids}")

Error Handling Examples

Handle Unsupported Files

def try_decode(filename, column_name):
    """Try to decode, fall back to PyArrow if needed."""
    import rugo.parquet as parquet_meta

    if parquet_meta.can_decode(filename):
        print("Using Rugo decoder")
        return parquet_meta.decode_column(filename, column_name)
    else:
        print("Using PyArrow (Rugo not compatible)")
        import pyarrow.parquet as pq
        table = pq.read_table(filename, columns=[column_name])
        return table.column(column_name).to_pylist()

# Works with any file
values = try_decode("any_file.parquet", "column_name")

Detailed Error Reporting

def decode_with_diagnostics(filename, column_name):
    """Decode with detailed error reporting."""
    metadata = parquet_meta.read_metadata(filename)

    # Find the column in the first row group
    rg = metadata["row_groups"][0]
    col_info = None
    for col in rg["columns"]:
        if col["name"] == column_name:
            col_info = col
            break

    if col_info is None:
        print(f"❌ Column {column_name} not found")
        return None

    # Check compression, encoding, and physical type
    if col_info["compression_codec"] != "UNCOMPRESSED":
        print(f"❌ Compression: {col_info['compression_codec']}")
        return None

    if "PLAIN" not in col_info["encodings"]:
        print(f"❌ Encodings: {col_info['encodings']}")
        return None

    if col_info["type"] not in ["INT32", "INT64", "BYTE_ARRAY"]:
        print(f"❌ Type: {col_info['type']}")
        return None

    # All checks passed
    print("✓ All checks passed, decoding...")
    return parquet_meta.decode_column(filename, column_name)
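
Usage follows the same pattern as the earlier helpers (test_decode.parquet is the file from the complete example above):

# Use it
values = decode_with_diagnostics("test_decode.parquet", "score")
if values is not None:
    print(f"Decoded {len(values)} values")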

Next Steps