11import json
2- import subprocess
32import time
43from decimal import Decimal
54
109from crowdgit .services .utils import run_shell_command
1110
1211_LARGE_REPO_THRESHOLD_BYTES = 10 * 1024 * 1024 * 1024 # 10 GiB (10 * 1024**3 = 10,737,418,240 bytes); repo sizes >= this value take the large-repo path
12+ # Repo IDs for which software value analysis is skipped entirely.
13+ # f7f92577-f258-49f0-b5b4-ba07194ca040: data repo (not a code repo), produces misleading results.
14+ _SOFTWARE_VALUE_EXCLUDED_REPO_IDS = frozenset ({"f7f92577-f258-49f0-b5b4-ba07194ca040" })
1315
1416
15- def _get_repo_size_bytes (repo_path : str ) -> int :
17+ async def _get_repo_size_bytes (repo_path : str ) -> int :
1618 """Return total disk usage of repo_path in bytes using du -sb, or 0 if the size cannot be determined."""
1719 try :
18- result = subprocess .run (
19- ["du" , "-sb" , repo_path ], capture_output = True , text = True , timeout = 120
20- )
21- if result .returncode == 0 :
22- return int (result .stdout .split ()[0 ])
20+ output = await run_shell_command (["du" , "-sb" , repo_path ], timeout = 120 )
21+ return int (output .split ()[0 ])
2322 except Exception :
2423 pass # best-effort: any failure (command error, empty or non-numeric output) falls through to the 0 fallback below
2524 return 0
@@ -37,9 +36,13 @@ async def run(self, repo_id: str, repo_path: str) -> None:
3736 """
3837 Triggers software value binary for given repo.
3938 Results are saved into insights database directly.
40- For repos larger than 10 GB, scc is run with minimum parallelism (1 worker)
41- to avoid OOM; results are identical .
39+ Repos in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS are skipped entirely.
40+ For repos larger than 10 GB, scc is run with --no-large (skipping files >100MB) to avoid OOM .
4241 """
42+ if repo_id in _SOFTWARE_VALUE_EXCLUDED_REPO_IDS :
43+ self .logger .info (f"Skipping software value for excluded repo { repo_id } " )
44+ return
45+
4346 start_time = time .time ()
4447 execution_status = ExecutionStatus .SUCCESS
4548 error_code = None
@@ -48,7 +51,7 @@ async def run(self, repo_id: str, repo_path: str) -> None:
4851 try :
4952 cmd = [self .software_value_executable ]
5053
51- repo_size = _get_repo_size_bytes (repo_path )
54+ repo_size = await _get_repo_size_bytes (repo_path )
5255 if repo_size >= _LARGE_REPO_THRESHOLD_BYTES :
5356 self .logger .info (
5457 f"Repo size { repo_size / (1024 ** 3 ):.1f} GB exceeds threshold — "
0 commit comments