import os
from pathlib import Path
from urllib.parse import urlparse

import requests
import yaml
from tqdm import tqdm

CONFIG_PATH = "config.yaml"
BASE_DOWNLOAD_DIR = Path("downloads")  # every downloaded file ends up under this directory
REQUEST_TIMEOUT = (10, 60)  # (connect, read) timeout in seconds, so a dead server cannot hang the run
DEFAULT_FILENAME = "downloaded_file"  # used when a URL path has no final component


def load_config(path: str):
    """Load the list of download entries from a YAML config file.

    Two layouts are accepted: a top-level list of entries, or a mapping
    whose ``items`` key holds that list.

    Args:
        path: Path of the YAML file to read.

    Returns:
        The list of entries found in the file.

    Raises:
        ValueError: If the file matches neither accepted layout.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    if isinstance(data, list):
        return data
    if isinstance(data, dict) and isinstance(data.get("items"), list):
        return data["items"]
    raise ValueError("Format config.yaml tidak dikenali")


def get_filename_from_url(url: str) -> str:
    """Return the last path component of *url*, or a default name if it is empty."""
    name = os.path.basename(urlparse(url).path)
    return name or DEFAULT_FILENAME


def build_filename(url: str, name_from_config: str | None) -> str:
    """Decide the file name to save a download under.

    - If no name is configured, use the name taken from the URL.
    - If the configured name has no extension, append the URL's extension.
    - If the configured name already has an extension, use it unchanged.

    Args:
        url: The download URL.
        name_from_config: Optional name supplied in the config entry.

    Returns:
        The planned file name.
    """
    url_name = get_filename_from_url(url)
    if not name_from_config:
        return url_name

    _, url_ext = os.path.splitext(url_name)
    cfg_root, cfg_ext = os.path.splitext(name_from_config)

    # No extension in the configured name: borrow the one from the URL.
    if not cfg_ext and url_ext:
        return cfg_root + url_ext

    # Configured name already carries an extension: keep it as given.
    return name_from_config


def _safe_filename(name: str, fallback: str) -> str:
    """Reduce *name* to a bare file name, or return *fallback* if nothing usable remains."""
    # Treat backslashes as separators too, so Windows-style paths are stripped
    # on every platform.
    candidate = os.path.basename(name.replace("\\", "/")).strip()
    if candidate in ("", ".", ".."):
        return fallback
    return candidate


def maybe_get_filename_from_headers(response: requests.Response, fallback: str) -> str:
    """Return the file name from the Content-Disposition header, if there is one.

    The header value comes from the remote server, so any directory part is
    dropped: only a bare file name is ever returned.

    Args:
        response: The HTTP response whose headers are inspected.
        fallback: Name to return when the header is missing or unusable.

    Returns:
        The sanitized header file name, or *fallback*.
    """
    cd = response.headers.get("content-disposition")
    if not cd:
        return fallback

    for part in cd.split(";"):
        key, sep, value = part.strip().partition("=")
        if sep and key.strip().lower() == "filename":
            filename = _safe_filename(value.strip().strip('"'), "")
            if filename:
                return filename

    return fallback


def download_file(url: str, target_dir: Path, name_from_config: str | None = None):
    """Download *url* into *target_dir*, skipping files that already exist.

    The data is streamed into a temporary ``.part`` file and renamed to its
    final name only after the download completes, so an interrupted run never
    leaves a truncated file that a later run would skip as already present.

    Args:
        url: The URL to download.
        target_dir: Directory to save into; created if it does not exist.
        name_from_config: Optional file name from the config entry. When set,
            it takes precedence over any name the server suggests.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        OSError: If the file cannot be written.
    """
    target_dir.mkdir(parents=True, exist_ok=True)

    # Initial file name, derived from the URL and the configured name.
    planned_name = build_filename(url, name_from_config)
    dest_path = target_dir / planned_name

    # Skip early when the file is already there.
    if dest_path.exists():
        print(f"[SKIP] File already exists: {dest_path}")
        return

    print(f"[INFO] Downloading: {url}")
    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as resp:
        resp.raise_for_status()

        # A server-supplied name overrides ours, but only when the user did
        # NOT set 'name' in the config.
        if name_from_config:
            filename = planned_name
        else:
            filename = maybe_get_filename_from_headers(resp, planned_name)

        dest_path = target_dir / filename

        # The header may have changed the name, so repeat the existence check
        # instead of overwriting a file downloaded by an earlier run.
        if filename != planned_name and dest_path.exists():
            print(f"[SKIP] File already exists: {dest_path}")
            return

        try:
            total_size = int(resp.headers.get("content-length", 0))
        except ValueError:
            # Malformed header: fall back to a progress bar without a total.
            total_size = 0

        chunk_size = 8192
        part_path = dest_path.with_name(dest_path.name + ".part")

        try:
            with open(part_path, "wb") as f, tqdm(
                total=total_size if total_size > 0 else None,
                unit="B",
                unit_scale=True,
                desc=str(filename),
            ) as progress:
                for chunk in resp.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        progress.update(len(chunk))
            os.replace(part_path, dest_path)
        finally:
            # No-op after a successful rename; removes the partial file otherwise.
            part_path.unlink(missing_ok=True)

    print(f"[OK] Saved to: {dest_path}")


def main():
    """Read the config file and download every entry that has a URL."""
    print(f"[INFO] Loading config from {CONFIG_PATH}")
    entries = load_config(CONFIG_PATH)

    for entry in entries:
        # A non-mapping entry cannot carry a URL; treat it like a missing one.
        url = entry.get("url") if isinstance(entry, dict) else None
        if not url:
            print("[WARN] Entry tanpa URL, di-skip:", entry)
            continue

        # `or ""` also covers an explicit null folder in the YAML.
        folder = entry.get("folder") or ""
        name = entry.get("name")  # may be None

        target_dir = BASE_DOWNLOAD_DIR / folder
        download_file(url, target_dir, name_from_config=name)

    print("[DONE] Semua download selesai.")


if __name__ == "__main__":
    main()