Skip to content

Commit b5f1207

Browse files
committed
chore: exclude data repo from sv processing
Signed-off-by: Mouad BANI <mouad-mb@outlook.com>
1 parent 76e7d30 commit b5f1207

1 file changed

Lines changed: 13 additions & 10 deletions

File tree

services/apps/git_integration/src/crowdgit/services/software_value/software_value_service.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import json
2-
import subprocess
32
import time
43
from decimal import Decimal
54

@@ -10,16 +9,16 @@
109
from crowdgit.services.utils import run_shell_command
1110

1211
_LARGE_REPO_THRESHOLD_BYTES = 10 * 1024 * 1024 * 1024 # 10 GB
12+
# Repos excluded from software value analysis.
13+
# f7f92577-f258-49f0-b5b4-ba07194ca040: data repo (not a code repo), produces misleading results.
14+
_SOFTWARE_VALUE_EXCLUDED_REPO_IDS = frozenset({"f7f92577-f258-49f0-b5b4-ba07194ca040"})
1315

1416

15-
def _get_repo_size_bytes(repo_path: str) -> int:
17+
async def _get_repo_size_bytes(repo_path: str) -> int:
1618
"""Return total disk usage of repo_path in bytes using du -sb."""
1719
try:
18-
result = subprocess.run(
19-
["du", "-sb", repo_path], capture_output=True, text=True, timeout=120
20-
)
21-
if result.returncode == 0:
22-
return int(result.stdout.split()[0])
20+
output = await run_shell_command(["du", "-sb", repo_path], timeout=120)
21+
return int(output.split()[0])
2322
except Exception:
2423
pass
2524
return 0
@@ -37,9 +36,13 @@ async def run(self, repo_id: str, repo_path: str) -> None:
3736
"""
3837
Triggers software value binary for given repo.
3938
Results are saved into insights database directly.
40-
For repos larger than 10 GB, scc is run with minimum parallelism (1 worker)
41-
to avoid OOM; results are identical.
39+
Repos in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS are skipped entirely.
40+
For repos larger than 10 GB, scc is run with --no-large (skipping files >100MB) to avoid OOM.
4241
"""
42+
if repo_id in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS:
43+
self.logger.info(f"Skipping software value for excluded repo {repo_id}")
44+
return
45+
4346
start_time = time.time()
4447
execution_status = ExecutionStatus.SUCCESS
4548
error_code = None
@@ -48,7 +51,7 @@ async def run(self, repo_id: str, repo_path: str) -> None:
4851
try:
4952
cmd = [self.software_value_executable]
5053

51-
repo_size = _get_repo_size_bytes(repo_path)
54+
repo_size = await _get_repo_size_bytes(repo_path)
5255
if repo_size >= _LARGE_REPO_THRESHOLD_BYTES:
5356
self.logger.info(
5457
f"Repo size {repo_size / (1024**3):.1f} GB exceeds threshold — "

0 commit comments

Comments
 (0)