Skip to content

Commit 59546c1

Browse files
committed
Refactor Sling class to simplify Arrow usage checks and update test to use an empty DataFrame as input
1 parent bcfa250 commit 59546c1

2 files changed

Lines changed: 11 additions & 23 deletions

File tree

sling/sling/__init__.py

Lines changed: 9 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -700,8 +700,7 @@ def _build_command(self) -> List[str]:
700700
# When input data is provided, we don't add source parameters
701701
# The sling binary will auto-detect stdin
702702
# Set source format to Arrow if using Arrow mode with input data
703-
# BUT only if we're streaming to stdout or the target explicitly uses Arrow
704-
if HAS_ARROW and self._should_use_arrow() and self._should_use_arrow_for_input():
703+
if HAS_ARROW and self._should_use_arrow():
705704
if self.src_options is None:
706705
self.src_options = SourceOptions(format=Format.ARROW, null_if='\\N')
707706
elif isinstance(self.src_options, dict):
@@ -784,7 +783,7 @@ def _build_command(self) -> List[str]:
784783

785784
def _write_input_data_sync(self, stdin: IO, input_data: Any):
786785
"""Write input data to stdin, using Arrow IPC format if available, otherwise CSV"""
787-
if HAS_ARROW and self._should_use_arrow() and self._should_use_arrow_for_input():
786+
if HAS_ARROW and self._should_use_arrow():
788787
self._write_input_data_arrow(stdin, input_data)
789788
else:
790789
self._write_input_data_csv(stdin, input_data)
@@ -794,27 +793,15 @@ def _should_use_arrow(self) -> bool:
794793
# Use Arrow if available and not disabled via env var
795794
return HAS_ARROW and os.environ.get('SLING_USE_ARROW', 'true').lower() != 'false'
796795

797-
def _should_use_arrow_for_input(self) -> bool:
798-
"""Determine if Arrow format should be used for input data"""
799-
# Only use Arrow for input if:
800-
# 1. We're streaming to stdout (no target object)
801-
# 2. Or the target explicitly requests Arrow format
802-
if not self.tgt_object:
803-
# Streaming to stdout
804-
return True
805-
806-
# Check if target format is explicitly set to Arrow
807-
if self.tgt_options:
808-
if isinstance(self.tgt_options, dict) and self.tgt_options.get('format') == Format.ARROW:
809-
return True
810-
elif hasattr(self.tgt_options, 'format') and self.tgt_options.format == Format.ARROW:
811-
return True
812-
813-
# For file targets, don't use Arrow for input unless explicitly requested
814-
return False
815-
816796
def _convert_to_arrow_table(self, input_data: Any) -> pa.Table:
817797
"""Convert input data to Arrow Table"""
798+
# Check for Arrow Dataset
799+
if HAS_ARROW and pa is not None:
800+
# if isinstance(input_data, pa.Dataset):
801+
# return input_data.to_table()
802+
if isinstance(input_data, pa.Table):
803+
return input_data
804+
818805
# Check for pandas DataFrame first
819806
if HAS_PANDAS and pd is not None and isinstance(input_data, pd.DataFrame):
820807
return pa.Table.from_pandas(input_data, preserve_index=False)

sling/tests/test_sling_class.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -564,8 +564,9 @@ def test_empty_input_data(self, temp_dir):
564564
"""Test handling of empty input data"""
565565
output_file = os.path.join(temp_dir, "empty.csv")
566566

567+
df = pd.DataFrame({'column1': []})
567568
sling = Sling(
568-
input=[], # Empty list
569+
input=df, # Empty dataframe
569570
tgt_object=f"file://{output_file}",
570571
debug=True
571572
)

0 commit comments

Comments
 (0)