Skip to content

Commit 7139c6b

Browse files
committed
Add tests for columns parameter type casting vs column selection
1 parent 3f1cc84 commit 7139c6b

1 file changed

Lines changed: 147 additions & 0 deletions

File tree

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""
2+
Test for columns parameter type casting vs column selection issue.
3+
4+
This test verifies that the `columns` parameter in Sling class is used for
5+
type casting (e.g., casting a string to timestamp) and NOT for column selection.
6+
7+
Issue: When using `columns={"created_at": "timestamp"}` with Arrow input,
8+
only the `created_at` column was being returned instead of all columns with
9+
`created_at` cast to timestamp type.
10+
11+
See: https://github.com/slingdata-io/sling-cli/issues/XXX
12+
"""
13+
import os
14+
import tempfile
15+
import pytest
16+
17+
from sling import Sling
18+
19+
20+
class TestColumnsTypeCasting:
    """Test that columns parameter applies type casting, not column selection.

    Every test writes the same four-column sample data to a temporary CSV
    file through ``Sling`` and then inspects the header of that file.
    """

    @staticmethod
    def _sample_data():
        """Return a fresh list of the sample rows (id, name, created_at, value).

        A new list is built on every call so that no test can be affected by
        another test (or by the code under test) mutating the input.
        """
        return [
            {"id": 1, "name": "Alice", "created_at": "2024-01-15", "value": 100},
            {"id": 2, "name": "Bob", "created_at": "2024-02-20", "value": 200},
        ]

    @classmethod
    def _run_to_csv(cls, **sling_kwargs):
        """Run Sling over the sample data into a temp CSV and return its rows.

        Args:
            **sling_kwargs: Extra keyword arguments forwarded unchanged to
                ``Sling(...)`` (e.g. ``columns=...``, ``select=...``).

        Returns:
            The output file parsed by ``csv.DictReader``: one dict per data
            row, keyed by the CSV header names.
        """
        import csv

        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "output.csv")

            sling = Sling(
                input=cls._sample_data(),
                tgt_object=f"file://{output_file}",
                **sling_kwargs,
            )
            sling.run(print_output=False)

            # The csv module expects files opened with newline="" so that it
            # can handle line endings (and embedded newlines) itself.
            # The rows are materialized before the temp directory is removed.
            with open(output_file, "r", newline="") as f:
                return list(csv.DictReader(f))

    def test_columns_casts_types_not_selects_with_arrow_input(self):
        """
        Test that columns parameter casts column types instead of selecting columns.

        When specifying columns for type casting, ALL columns should be present
        in the output, not only the columns named in ``columns``.

        NOTE(review): the input here is a plain list of dicts; presumably Sling
        streams it in Arrow format - confirm. Only column presence and the
        ``name`` values are asserted; the cast value of ``created_at`` itself
        is not checked.
        """
        # 'columns' should cast created_at to timestamp, NOT select only it
        rows = self._run_to_csv(columns={"created_at": "timestamp"})

        assert len(rows) == 2, f"Expected 2 rows, got {len(rows)}"

        # Get the column names from the output
        output_columns = set(rows[0].keys())

        # Verify ALL original columns are present (columns is for casting, not selection)
        assert 'id' in output_columns, \
            f"id column should exist - columns is for type casting, not selection. Got columns: {output_columns}"
        assert 'name' in output_columns, \
            f"name column should exist - columns is for type casting, not selection. Got columns: {output_columns}"
        assert 'created_at' in output_columns, \
            f"created_at column should exist. Got columns: {output_columns}"
        assert 'value' in output_columns, \
            f"value column should exist - columns is for type casting, not selection. Got columns: {output_columns}"

        # Verify data integrity
        assert rows[0]['name'] == 'Alice'
        assert rows[1]['name'] == 'Bob'

    def test_select_filters_columns(self):
        """
        Test that select parameter actually filters/selects columns.

        This is the expected behavior for column selection - use `select`, not `columns`.
        """
        rows = self._run_to_csv(select=["id", "name"])

        # Fail with a readable message instead of an IndexError on rows[0]
        assert rows, "Expected at least one output row, got none"

        output_columns = set(rows[0].keys())

        # With select, we expect ONLY the selected columns
        assert output_columns == {"id", "name"}, \
            f"Expected only selected columns (id, name), got: {output_columns}"

    def test_columns_and_select_together(self):
        """
        Test using both columns (for casting) and select (for filtering) together.
        """
        rows = self._run_to_csv(
            select=["id", "name", "created_at"],  # Filter to these 3 columns
            columns={"created_at": "timestamp"},  # Cast created_at to timestamp
        )

        # Fail with a readable message instead of an IndexError on rows[0]
        assert rows, "Expected at least one output row, got none"

        output_columns = set(rows[0].keys())

        # Should have the 3 selected columns
        assert output_columns == {"id", "name", "created_at"}, \
            f"Expected selected columns (id, name, created_at), got: {output_columns}"

        # value should NOT be present (filtered out by select)
        assert 'value' not in output_columns
144+
145+
146+
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly exits
    # non-zero when a test fails, instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))

0 commit comments

Comments
 (0)