packaging

temp
2026-06-04 01:35:55 +03:00 · 2026-05-10 14:39:54 +03:00
9 changed files with 2763 additions and 404 deletions
--- a/flatten-step3.py
+++ b/flatten-step3.py
@@ -1,61 +1,61 @@
-import shutil
-from pathlib import Path
-
-def flatten_gpx_files(source_dir: str = ".", target_dir: str = "ALL_GPX"):
-    """
-    Find all .gpx files under source_dir (including subfolders)
-    and copy them into a single flat folder.
-    """
-    source_path = Path(source_dir).resolve()
-    target_path = Path(target_dir).resolve()
-
-    # Create target folder
-    target_path.mkdir(parents=True, exist_ok=True)
-
-    print(f"Searching for .gpx files in: {source_path}")
-    print(f"Copying to flat folder: {target_path}\n")
-
-    gpx_files = list(source_path.rglob("*.gpx"))
-    
-    if not gpx_files:
-        print("No .gpx files found.")
-        return
-
-    copied = 0
-    for gpx_file in gpx_files:
-        # New filename: original_name__parent_folder.gpx  (helps avoid name collisions)
-        parent_name = gpx_file.parent.name
-        new_name = f"{gpx_file.stem}__{parent_name}{gpx_file.suffix}"
-        
-        destination = target_path / new_name
-
-        # If filename already exists, add a number
-        counter = 1
-        while destination.exists():
-            destination = target_path / f"{gpx_file.stem}__{parent_name}_{counter}{gpx_file.suffix}"
-            counter += 1
-
-        try:
-            shutil.copy2(gpx_file, destination)
-            print(f"Copied: {gpx_file.name}  →  {new_name}")
-            copied += 1
-        except Exception as e:
-            print(f"Failed {gpx_file.name}: {e}")
-
-    print("\n" + "="*50)
-    print(f"Done! {copied} .gpx files flattened into '{target_path.name}/'")
-    print("="*50)
-
-
-if __name__ == "__main__":
-    import argparse
-    
-    parser = argparse.ArgumentParser(description="Flatten all .gpx files into one folder")
-    parser.add_argument("source", nargs="?", default=".", 
-                        help="Source directory to search (default: current)")
-    parser.add_argument("-o", "--output", default="ALL_GPX", 
-                        help="Output folder name (default: ALL_GPX)")
-    
-    args = parser.parse_args()
-    
+import shutil
+from pathlib import Path
+
+def flatten_gpx_files(source_dir: str = ".", target_dir: str = "ALL_GPX"):
+    """
+    Copy every .gpx file found under source_dir (recursively) into one flat
+    folder, target_dir, which is created if missing. Returns None.
+
+    Copies are named <stem>__<parent folder>.gpx; if that name is taken a
+    numeric suffix is appended, so existing files are never overwritten.
+    Files already inside target_dir are skipped so reruns do not re-copy
+    earlier output.
+    """
+    source_path = Path(source_dir).resolve()
+    target_path = Path(target_dir).resolve()
+    target_path.mkdir(parents=True, exist_ok=True)
+
+    print(f"Searching for .gpx files in: {source_path}")
+    print(f"Copying to flat folder: {target_path}\n")
+
+    # The target often lives under the source (the defaults are "." and
+    # "ALL_GPX"); skip its contents so earlier output is not copied again.
+    # Building the list up front also keeps new copies out of the walk.
+    gpx_files = [
+        p for p in source_path.rglob("*.gpx") if target_path not in p.parents
+    ]
+
+    if not gpx_files:
+        print("No .gpx files found.")
+        return
+
+    copied = 0
+    for gpx_file in gpx_files:
+        # original_name__parent_folder helps avoid name collisions
+        base_name = f"{gpx_file.stem}__{gpx_file.parent.name}"
+        destination = target_path / f"{base_name}{gpx_file.suffix}"
+
+        counter = 1
+        while destination.exists():  # name taken: try _1, _2, ...
+            destination = target_path / f"{base_name}_{counter}{gpx_file.suffix}"
+            counter += 1
+
+        try:
+            shutil.copy2(gpx_file, destination)
+            # Report the name actually written, including any numeric suffix.
+            print(f"Copied: {gpx_file.name}  →  {destination.name}")
+            copied += 1
+        except OSError as e:
+            print(f"Failed {gpx_file.name}: {e}")
+
+    print("\n" + "="*50)
+    print(f"Done! {copied} .gpx files flattened into '{target_path.name}/'")
+    print("="*50)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # CLI: optional positional source directory plus -o/--output target folder.
+    parser = argparse.ArgumentParser(description="Flatten all .gpx files into one folder")
+    parser.add_argument(
+        "source",
+        nargs="?",
+        default=".",
+        help="Source directory to search (default: current)",
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        default="ALL_GPX",
+        help="Output folder name (default: ALL_GPX)",
+    )
+    args = parser.parse_args()
+
    flatten_gpx_files(args.source, args.output)
--- a/packaging/route_packager.py
+++ b/packaging/route_packager.py
--- a/packaging/route_packager_README.md
+++ b/packaging/route_packager_README.md
@@ -0,0 +1,114 @@
+# Route Packager — OsmAnd + Google KMZ
+
+This package contains `route_packager.py`, a Python CLI for the scraped route archive tree in `v4-scrape-675.tar.gz`.
+
+## What it does
+
+- Scans the canonical `downloads_by_hash/` folder by default, avoiding retry/failure duplicate folders.
+- Extracts ZIP, RAR, 7Z payloads.
+- Imports GPX, KML, KMZ routes directly.
+- Converts Garmin GDB to GPX when `gpsbabel` is installed.
+- Carries over scrape metadata from `*.source.json` into route descriptions and reports.
+- Carries over small text files into route descriptions.
+- Embeds image files into the KMZ and OSF package under `media/...` and references them from descriptions.
+- Writes machine-readable `report.json` and `routes.csv` for validation/pre-push review.
+
+## Outputs
+
+Default command creates all of these:
+
+- `<name>.osmand-tracks.osf` — OsmAnd package/zip containing normalized GPX tracks.
+- `<name>.all-routes.gpx` — fallback aggregate GPX for OsmAnd if the OSF package import is not accepted by a particular build.
+- `<name>.google-earth-maps.kmz` — binary zipped KML package for Google My Maps / Google Earth.
+- `<name>.google-earth-maps.kml` — plain KML fallback.
+- `<name>.report.json` — full import/skip/warning report.
+- `<name>.routes.csv` — route list with source archive, point count, distance, bbox, etc.
+
+## Install extraction/conversion tools
+
+The script itself is stdlib-only for ZIP/KML/KMZ/GPX. RAR, 7Z, and GDB need external tools:
+
+```bash
+sudo apt update
+sudo apt install p7zip-full unrar-free gpsbabel
+```
+
+If `unrar-free` fails on some RAR versions, install one of `unrar`, `unar`, or `bsdtar` depending on your distro.
+
+## Full run
+
+```bash
+python3 route_packager.py \
+  --input ./v4-scrape-675.tar.gz \
+  --out ./route-out \
+  --target both \
+  --name bg-mountain-routes \
+  --verbose
+```
+
+## Google My Maps one-file safe mode
+
+Google My Maps has a small uncompressed KML/KMZ import ceiling. For a single importable KMZ, use safe mode. It simplifies only the Google output; OsmAnd keeps full GPX detail.
+
+```bash
+python3 route_packager.py \
+  --input ./v4-scrape-675.tar.gz \
+  --out ./route-out-google-safe \
+  --target google \
+  --name bg-mountain-routes \
+  --google-my-maps-safe
+```
+
+Manual geometry cap if needed:
+
+```bash
+python3 route_packager.py --input ./v4-scrape-675.tar.gz --out ./out --target google \
+  --google-max-points-per-route 500 --google-lean-descriptions
+```
+
+## ZIP-only validation without RAR tooling
+
+```bash
+python3 route_packager.py \
+  --input ./v4-scrape-675.tar.gz \
+  --out ./route-out-zip-only \
+  --target both \
+  --skip-rar \
+  --name bg-mountain-routes-zip-only
+```
+
+## Strict CI/pre-push mode
+
+```bash
+python3 route_packager.py \
+  --input ./v4-scrape-675.tar.gz \
+  --out ./route-out \
+  --target both \
+  --name bg-mountain-routes \
+  --strict
+```
+
+`--strict` exits non-zero if RARs/GDBs fail to import.
+
+## Validation commands
+
+```bash
+python3 -m py_compile route_packager.py
+python3 route_packager.py --input ./v4-scrape-675.tar.gz --out ./out --target both --skip-rar --name smoke
+python3 - <<'PY'
+from pathlib import Path
+import zipfile, xml.etree.ElementTree as ET
+out = Path('./out')
+for name in ['smoke.osmand-tracks.osf', 'smoke.google-earth-maps.kmz']:
+    with zipfile.ZipFile(out / name) as z:
+        assert z.testzip() is None, name
+ET.parse(out / 'smoke.all-routes.gpx')
+with zipfile.ZipFile(out / 'smoke.google-earth-maps.kmz') as z:
+    ET.fromstring(z.read('doc.kml'))
+print('OK')
+PY
+```
+
+## Format note
+
+There is no arbitrary custom binary route format for Google Maps imports. KMZ is the practical binary container because it is zipped KML and can include images/media. For OsmAnd, the script emits an OSF-style package plus a GPX fallback because GPX is OsmAnd's most predictable track import path.
--- a/packaging/start.sh
+++ b/packaging/start.sh
@@ -0,0 +1,9 @@
+# Full packaging run. One-time setup (uncomment to install the external tools):
+# sudo apt update
+# sudo apt install p7zip-full unrar-free gpsbabel
+
+# Expects route_packager.py and v4-scrape-675.tar.gz in the current directory.
+python3 route_packager.py \
+  --input ./v4-scrape-675.tar.gz \
+  --out ./route-out \
+  --target both \
+  --name bg-mountain-routes \
+  --verbose
--- a/packaging/v4-scrape-675.tar.gz
+++ b/packaging/v4-scrape-675.tar.gz
--- a/parse-archive-structure-step-2.py
+++ b/parse-archive-structure-step-2.py
@@ -1,100 +1,100 @@
-import os
-from pathlib import Path
-
-def rename_files_in_folders(root_dir: str = ".", recursive: bool = True, dry_run: bool = False):
-    """
-    Rename files by appending the top-level folder name.
-    
-    Example: image.jpg inside 'extracted_MyBackup' becomes image_MyBackup.jpg
-    """
-    base_path = Path(root_dir).resolve()
-    
-    if not base_path.exists():
-        print(f"Error: Folder '{base_path}' does not exist.")
-        return
-    
-    print(f"Scanning folder: {base_path}")
-    if dry_run:
-        print("*** DRY RUN MODE - No files will be renamed ***\n")
-    
-    renamed_count = 0
-    skipped_count = 0
-    
-    # Walk through all directories
-    for dir_path in base_path.rglob("*") if recursive else base_path.iterdir():
-        if not dir_path.is_dir():
-            continue
-            
-        folder_name = dir_path.name
-        
-        # Skip root folder itself and hidden folders
-        if folder_name.startswith('.') or folder_name == base_path.name:
-            continue
-            
-        print(f"\nProcessing folder: {folder_name}")
-        
-        for file_path in dir_path.iterdir():
-            if not file_path.is_file():
-                continue
-                
-            # Get file components
-            original_name = file_path.stem      # filename without extension
-            ext = file_path.suffix              # .jpg, .png, etc.
-            
-            # Skip if file already has the folder name (to avoid double renaming)
-            if folder_name in original_name:
-                print(f"  Skipped (already processed): {file_path.name}")
-                skipped_count += 1
-                continue
-            
-            # New filename: originalName_folderName.ext
-            new_name = f"{original_name}_{folder_name}{ext}"
-            new_path = file_path.parent / new_name
-            
-            # Check if target file already exists
-            if new_path.exists():
-                print(f"  Warning: Target already exists → {new_name}")
-                skipped_count += 1
-                continue
-            
-            try:
-                if dry_run:
-                    print(f"  Would rename: {file_path.name}  →  {new_name}")
-                else:
-                    file_path.rename(new_path)
-                    print(f"  Renamed: {file_path.name}  →  {new_name}")
-                renamed_count += 1
-            except Exception as e:
-                print(f"  Error renaming {file_path.name}: {e}")
-                skipped_count += 1
-    
-    print("\n" + "="*60)
-    print("Renaming completed!")
-    print(f"Files renamed:     {renamed_count}")
-    print(f"Files skipped:     {skipped_count}")
-    if dry_run:
-        print("This was a DRY RUN — no actual changes were made.")
-    print("="*60)
-
-
-if __name__ == "__main__":
-    import argparse
-    
-    parser = argparse.ArgumentParser(
-        description="Rename files by appending their top-level folder name."
-    )
-    parser.add_argument("folder", nargs="?", default=".", 
-                        help="Root folder to process (default: current directory)")
-    parser.add_argument("-r", "--recursive", action="store_true", 
-                        help="Process subfolders recursively (recommended)")
-    parser.add_argument("--dry-run", action="store_true",
-                        help="Show what would be renamed without making changes")
-    
-    args = parser.parse_args()
-    
-    # By default we enable recursive since you usually want to process extracted_ folders
-    rename_files_in_folders(
-        root_dir=args.folder,
-        recursive=args.recursive if hasattr(args, 'recursive') else True,
-        dry_run=args.dry_run
+import os
+from pathlib import Path
+
+def rename_files_in_folders(root_dir: str = ".", recursive: bool = True, dry_run: bool = False):
+    """
+    Append the containing folder's name to every file below root_dir.
+
+    Example: image.jpg inside 'extracted_MyBackup' becomes
+    image_extracted_MyBackup.jpg. Files directly in root_dir are left alone.
+
+    root_dir:  folder whose subfolders are processed.
+    recursive: walk all nested subfolders if True, else only direct children.
+    dry_run:   print the planned renames without touching the disk.
+    Prints a summary and returns None.
+    """
+    base_path = Path(root_dir).resolve()
+
+    if not base_path.exists():
+        print(f"Error: Folder '{base_path}' does not exist.")
+        return
+
+    print(f"Scanning folder: {base_path}")
+    if dry_run:
+        print("*** DRY RUN MODE - No files will be renamed ***\n")
+
+    renamed_count = 0
+    skipped_count = 0
+
+    # Snapshot the folder list first so renames cannot disturb the walk.
+    candidates = list(base_path.rglob("*") if recursive else base_path.iterdir())
+    for dir_path in candidates:
+        if not dir_path.is_dir():
+            continue
+
+        folder_name = dir_path.name
+
+        # Skip hidden folders and everything nested inside one (e.g. .git/hooks).
+        # The root itself is never yielded, so no name check is needed for it;
+        # comparing names would only skip a subfolder that shares the root's name.
+        if any(part.startswith('.') for part in dir_path.relative_to(base_path).parts):
+            continue
+
+        print(f"\nProcessing folder: {folder_name}")
+
+        for file_path in dir_path.iterdir():
+            if not file_path.is_file():
+                continue
+
+            original_name = file_path.stem      # filename without extension
+            ext = file_path.suffix              # .jpg, .png, etc.
+
+            # Skip if file already has the folder name (to avoid double renaming)
+            if folder_name in original_name:
+                print(f"  Skipped (already processed): {file_path.name}")
+                skipped_count += 1
+                continue
+
+            # New filename: originalName_folderName.ext
+            new_name = f"{original_name}_{folder_name}{ext}"
+            new_path = file_path.parent / new_name
+
+            # Never overwrite an existing file
+            if new_path.exists():
+                print(f"  Warning: Target already exists → {new_name}")
+                skipped_count += 1
+                continue
+
+            try:
+                if dry_run:
+                    print(f"  Would rename: {file_path.name}  →  {new_name}")
+                else:
+                    file_path.rename(new_path)
+                    print(f"  Renamed: {file_path.name}  →  {new_name}")
+                renamed_count += 1
+            except OSError as e:
+                print(f"  Error renaming {file_path.name}: {e}")
+                skipped_count += 1
+
+    print("\n" + "="*60)
+    print("Renaming completed!")
+    print(f"Files renamed:     {renamed_count}")
+    print(f"Files skipped:     {skipped_count}")
+    if dry_run:
+        print("This was a DRY RUN — no actual changes were made.")
+    print("="*60)
+
+
+if __name__ == "__main__":
+    import argparse
+    
+    parser = argparse.ArgumentParser(
+        description="Rename files by appending the name of their containing folder."
+    )
+    parser.add_argument("folder", nargs="?", default=".", 
+                        help="Root folder to process (default: current directory)")
+    parser.add_argument("-r", "--recursive", action="store_true", 
+                        help="Process subfolders recursively (recommended)")
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Show what would be renamed without making changes")
+    
+    args = parser.parse_args()
+    
+    # store_true always defines args.recursive (False unless -r is given), so
+    # the old hasattr() fallback to True could never trigger: recursion is
+    # opt-in on the command line even though the function defaults to True.
+    rename_files_in_folders(
+        root_dir=args.folder,
+        recursive=args.recursive,
+        dry_run=args.dry_run
    )
--- a/parse-archives.py
+++ b/parse-archives.py
@@ -1,75 +1,75 @@
-import os
-import sys
-from pathlib import Path
-
-# Install these first:
-# pip install patool py7zr rarfile
-
-try:
-    from patoolib import extract_archive
-except ImportError:
-    print("Error: 'patool' is not installed. Run: pip install patool py7zr rarfile")
-    sys.exit(1)
-
-def extract_archive_to_folder(archive_path: Path, base_dir: Path):
-    """Extract a single archive to extracted_{name} folder."""
-    # Get archive name without extension
-    name_without_ext = archive_path.stem
-    extract_dir = base_dir / f"bg_mountains_{name_without_ext}"
-    
-    # Create the output directory if it doesn't exist
-    extract_dir.mkdir(parents=True, exist_ok=True)
-    
-    print(f"Extracting: {archive_path.name} → {extract_dir.name}/")
-    
-    try:
-        # patool automatically detects format (zip, rar, 7z, etc.)
-        extract_archive(str(archive_path), outdir=str(extract_dir), verbosity=0)
-        print(f"✓ Successfully extracted: {archive_path.name}\n")
-    except Exception as e:
-        print(f"✗ Failed to extract {archive_path.name}: {e}\n")
-
-def main(folder_path: str = ".", recursive: bool = False):
-    base_dir = Path(folder_path).resolve()
-    
-    if not base_dir.exists():
-        print(f"Error: Folder '{base_dir}' does not exist.")
-        return
-    
-    print(f"Scanning for archives in: {base_dir}\n")
-    
-    # Supported extensions
-    extensions = {'.zip', '.rar', '.7z'}
-    
-    # Find all matching archives
-    if recursive:
-        archive_files = [p for p in base_dir.rglob("*") if p.is_file() and p.suffix.lower() in extensions]
-    else:
-        archive_files = [p for p in base_dir.iterdir() if p.is_file() and p.suffix.lower() in extensions]
-    
-    if not archive_files:
-        print("No .zip, .rar, or .7z files found.")
-        return
-    
-    print(f"Found {len(archive_files)} archive(s).\n")
-    
-    for archive in sorted(archive_files):
-        extract_archive_to_folder(archive, base_dir)
-    
-    print("Extraction process completed!")
-
-if __name__ == "__main__":
-    # Usage examples:
-    # python extract_archives.py                    # current folder, non-recursive
-    # python extract_archives.py "/path/to/folder"   # specific folder
-    # python extract_archives.py "/path/to/folder" --recursive
-    
-    import argparse
-    
-    parser = argparse.ArgumentParser(description="Extract .zip/.rar/.7z archives into separate folders.")
-    parser.add_argument("folder", nargs="?", default=".", help="Folder to scan (default: current directory)")
-    parser.add_argument("-r", "--recursive", action="store_true", help="Search subfolders recursively")
-    
-    args = parser.parse_args()
-    
+import os
+import sys
+from pathlib import Path
+
+# Install these first:
+# pip install patool py7zr rarfile
+
+try:
+    from patoolib import extract_archive
+except ImportError:
+    print("Error: 'patool' is not installed. Run: pip install patool py7zr rarfile")
+    sys.exit(1)
+
+def extract_archive_to_folder(archive_path: Path, base_dir: Path) -> None:
+    """Extract one archive into base_dir/bg_mountains_<archive stem>/."""
+    # Only the last extension is dropped (.stem). NOTE(review): archives that
+    # share a stem, e.g. a.zip and a.rar, end up in the same output folder.
+    name_without_ext = archive_path.stem
+    extract_dir = base_dir / f"bg_mountains_{name_without_ext}"
+    
+    # Create the output directory if it doesn't exist
+    extract_dir.mkdir(parents=True, exist_ok=True)
+    
+    print(f"Extracting: {archive_path.name} → {extract_dir.name}/")
+    
+    try:
+        # patool automatically detects format (zip, rar, 7z, etc.)
+        extract_archive(str(archive_path), outdir=str(extract_dir), verbosity=0)
+        print(f"✓ Successfully extracted: {archive_path.name}\n")
+    except Exception as e:
+        # Best effort: report the failure and return so the caller can go on.
+        print(f"✗ Failed to extract {archive_path.name}: {e}\n")
+
+def main(folder_path: str = ".", recursive: bool = False):
+    """Extract every .zip/.rar/.7z found in folder_path into per-archive folders.
+
+    With recursive=True subfolders are searched too; output folders are always
+    created directly under folder_path. Progress is printed; returns None.
+    """
+    base_dir = Path(folder_path).resolve()
+    if not base_dir.exists():
+        print(f"Error: Folder '{base_dir}' does not exist.")
+        return
+
+    print(f"Scanning for archives in: {base_dir}\n")
+
+    # Suffix match is case-insensitive, so .ZIP and .Rar are picked up too.
+    extensions = {'.zip', '.rar', '.7z'}
+    entries = base_dir.rglob("*") if recursive else base_dir.iterdir()
+    archive_files = []
+    for entry in entries:
+        if entry.is_file() and entry.suffix.lower() in extensions:
+            archive_files.append(entry)
+
+    if not archive_files:
+        print("No .zip, .rar, or .7z files found.")
+        return
+
+    print(f"Found {len(archive_files)} archive(s).\n")
+    for archive in sorted(archive_files):
+        extract_archive_to_folder(archive, base_dir)
+    print("Extraction process completed!")
+
+if __name__ == "__main__":
+    # Usage examples:
+    # python parse-archives.py                    # current folder, non-recursive
+    # python parse-archives.py "/path/to/folder"   # specific folder
+    # python parse-archives.py "/path/to/folder" --recursive
+    
+    import argparse
+    
+    parser = argparse.ArgumentParser(description="Extract .zip/.rar/.7z archives into separate folders.")
+    parser.add_argument("folder", nargs="?", default=".", help="Folder to scan (default: current directory)")
+    parser.add_argument("-r", "--recursive", action="store_true", help="Search subfolders recursively")
+    
+    args = parser.parse_args()
+    
    main(args.folder, args.recursive)
--- a/readme.md
+++ b/readme.md
--- a/visit.py
+++ b/visit.py
@@ -1,172 +1,172 @@
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
-import time
-import os
-
-# ========================= CONFIG =========================
-PROXY_IP = "192.168.0.38"
-PROXY_PORT = "1080"
-DOWNLOAD_PATH = os.path.join(os.getcwd(), "downloads")
-LOG_FILE = "download_log.txt"
-
-os.makedirs(DOWNLOAD_PATH, exist_ok=True)
-
-# ====================== CHROME OPTIONS (Optimized for speed) ======================
-chrome_options = Options()
-
-# Proxy & basic stealth
-chrome_options.add_argument(f'--proxy-server=socks5://{PROXY_IP}:{PROXY_PORT}')
-chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")
-chrome_options.add_argument("--no-sandbox")
-chrome_options.add_argument("--disable-dev-shm-usage")
-chrome_options.add_argument("--disable-blink-features=AutomationControlled")
-chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
-chrome_options.add_experimental_option('useAutomationExtension', False)
-
-# === SPEED OPTIMIZATIONS ===
-chrome_options.add_argument("--blink-settings=imagesEnabled=false")   # Disable images
-chrome_options.add_argument("--disable-gpu")
-chrome_options.add_argument("--disable-extensions")
-chrome_options.add_argument("--disable-plugins")
-chrome_options.add_argument("--disable-popup-blocking")
-
-# Fast page load strategy (doesn't wait for images/media)
-chrome_options.page_load_strategy = "eager"   # or "none" if you want even faster (but less stable)
-
-# Block media, stylesheets, fonts, etc.
-prefs = {
-    "download.default_directory": DOWNLOAD_PATH,
-    "download.prompt_for_download": False,
-    "download.directory_upgrade": True,
-    "safebrowsing.enabled": True,
-    "profile.default_content_setting_values.automatic_downloads": 1,
-    "profile.managed_default_content_settings.images": 2,        # 2 = block images
-    "profile.managed_default_content_settings.stylesheets": 2,   # block CSS
-    "profile.managed_default_content_settings.fonts": 2,         # block fonts
-    "profile.managed_default_content_settings.media_stream": 2,  # block video/audio
-}
-
-chrome_options.add_experimental_option("prefs", prefs)
-
-# Optional: Run headless (much faster, recommended)
-# chrome_options.add_argument("--headless=new")
-
-# Initialize driver
-driver = webdriver.Chrome(options=chrome_options)
-
-def wait_for_page_load(driver, timeout=20):
-    """Faster page load wait"""
-    try:
-        WebDriverWait(driver, timeout).until(
-            lambda d: d.execute_script("return document.readyState") in ["interactive", "complete"]
-        )
-    except TimeoutException:
-        print("   → Page load timed out, continuing anyway...")
-
-def is_download_finished(download_path, timeout=120, check_interval=2):
-    """Slightly faster download checker"""
-    end_time = time.time() + timeout
-    while time.time() < end_time:
-        partial_files = [f for f in os.listdir(download_path) if f.endswith(('.crdownload', '.part', '.tmp'))]
-        completed_files = [f for f in os.listdir(download_path) if not f.endswith(('.crdownload', '.part', '.tmp'))]
-
-        if completed_files and not partial_files:
-            return True, completed_files
-
-        if partial_files:
-            print(f"   → Still downloading... ({len(partial_files)} partial)")
-        time.sleep(check_interval)
-    return False, []
-
-def clear_partial_downloads():
-    for filename in os.listdir(DOWNLOAD_PATH):
-        if filename.endswith(('.crdownload', '.part', '.tmp')):
-            try:
-                os.remove(os.path.join(DOWNLOAD_PATH, filename))
-            except:
-                pass
-
-def log_result(url: str, success: bool):
-    status = "True" if success else "False"
-    with open(LOG_FILE, "a", encoding="utf-8") as f:
-        f.write(f"{url} | {status}\n")
-    print(f"   Logged: {'SUCCESS' if success else 'FAILED'}")
-
-def run_download(url: str):
-    try:
-        print(f"\n[{time.strftime('%H:%M:%S')}] Processing → {url}")
-        clear_partial_downloads()
-
-        print("   Navigating...")
-        driver.get(url)
-
-        wait_for_page_load(driver, timeout=20)
-        print("   ✓ Page loaded (eager).")
-
-        wait = WebDriverWait(driver, 10)  # Reduced timeout
-
-        # === License checkbox (quick check) ===
-        try:
-            checkbox = wait.until(EC.presence_of_element_located((By.NAME, "license_agree")))
-            if checkbox.is_enabled() and checkbox.is_displayed():
-                checkbox.click()
-                print("   ✓ License checkbox accepted.")
-                time.sleep(0.5)
-        except (TimeoutException, NoSuchElementException):
-            print("   → No checkbox found. Continuing...")
-
-        # === Download button (quick detection) ===
-        try:
-            download_btn = wait.until(EC.element_to_be_clickable((By.ID, "jd_license_submit")))
-            download_btn.click()
-            print("   ✓ Download button clicked.")
-        except (TimeoutException, NoSuchElementException):
-            print("   ✗ Download button NOT found or not clickable → Skipping")
-            log_result(url, False)
-            return False
-
-        # === Wait for download ===
-        print("   Waiting for download (max 2 min)...")
-        success, files = is_download_finished(DOWNLOAD_PATH, timeout=120)
-        if success and files:
-            print(f"   ✓ Download completed! Files: {files}")
-            log_result(url, True)
-            return True
-        else:
-            print("   ✗ Download timed out.")
-            log_result(url, False)
-            return False
-
-    except WebDriverException as e:
-        print(f"   ✗ WebDriver error: {e}")
-        log_result(url, False)
-        return False
-    except Exception as e:
-        print(f"   ✗ Unexpected error: {e}")
-        log_result(url, False)
-        return False
-
-# ====================== MAIN ======================
-if __name__ == "__main__":
-    with open("url-final.txt", "r", encoding="utf-8") as f:
-        links = [line.strip() for line in f if line.strip()]
-
-    links = links[780:]  # start from 251st
-    print(f"Loaded {len(links)} URLs. Starting from index 250.")
-
-    success_count = 0
-    for i, url in enumerate(links, 1):
-        print(f"--- [{i}/{len(links)}] ---")
-        if run_download(url):
-            success_count += 1
-        time.sleep(3)  # Reduced delay between requests
-
-    print(f"\n=== FINISHED ===")
-    print(f"Total: {len(links)} | Success: {success_count} | Failed: {len(links) - success_count}")
-    print(f"Log: {LOG_FILE}")
-
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
+import time
+import os
+
+# ========================= CONFIG =========================
+# NOTE: everything from here down to the driver creation runs at import time.
+# SOCKS5 proxy endpoint handed to Chrome via --proxy-server below.
+PROXY_IP = "192.168.0.38"
+PROXY_PORT = "1080"
+# Downloads land in ./downloads, resolved against the working directory at startup.
+DOWNLOAD_PATH = os.path.join(os.getcwd(), "downloads")
+# Plain-text result log; log_result() appends one "<url> | True/False" line per URL.
+LOG_FILE = "download_log.txt"
+
+os.makedirs(DOWNLOAD_PATH, exist_ok=True)
+
+# ====================== CHROME OPTIONS (Optimized for speed) ======================
+chrome_options = Options()
+
+# Proxy & basic stealth
+chrome_options.add_argument(f'--proxy-server=socks5://{PROXY_IP}:{PROXY_PORT}')
+chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")
+chrome_options.add_argument("--no-sandbox")
+chrome_options.add_argument("--disable-dev-shm-usage")
+chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+chrome_options.add_experimental_option('useAutomationExtension', False)
+
+# === SPEED OPTIMIZATIONS ===
+chrome_options.add_argument("--blink-settings=imagesEnabled=false")   # Disable images
+chrome_options.add_argument("--disable-gpu")
+chrome_options.add_argument("--disable-extensions")
+chrome_options.add_argument("--disable-plugins")
+chrome_options.add_argument("--disable-popup-blocking")
+
+# Fast page load strategy (doesn't wait for images/media)
+chrome_options.page_load_strategy = "eager"   # or "none" if you want even faster (but less stable)
+
+# Download behaviour (silent, into DOWNLOAD_PATH) plus content blocking.
+prefs = {
+    "download.default_directory": DOWNLOAD_PATH,
+    "download.prompt_for_download": False,
+    "download.directory_upgrade": True,
+    "safebrowsing.enabled": True,
+    "profile.default_content_setting_values.automatic_downloads": 1,
+    "profile.managed_default_content_settings.images": 2,        # 2 = block images
+    "profile.managed_default_content_settings.stylesheets": 2,   # block CSS
+    "profile.managed_default_content_settings.fonts": 2,         # block fonts
+    # NOTE(review): media_stream looks like the camera/microphone permission
+    # rather than video/audio playback - confirm this has the intended effect.
+    "profile.managed_default_content_settings.media_stream": 2,  # block video/audio
+}
+
+chrome_options.add_experimental_option("prefs", prefs)
+
+# Optional: Run headless (much faster, recommended)
+# chrome_options.add_argument("--headless=new")
+
+# Initialize driver (launches Chrome as a side effect of importing this module)
+driver = webdriver.Chrome(options=chrome_options)
+
+def wait_for_page_load(driver, timeout=20):
+    """Wait up to timeout seconds for document.readyState to be interactive or complete.
+
+    A timeout is reported on stdout and otherwise ignored; returns None.
+    """
+    def dom_ready(d):
+        return d.execute_script("return document.readyState") in ["interactive", "complete"]
+
+    try:
+        WebDriverWait(driver, timeout).until(dom_ready)
+    except TimeoutException:
+        print("   → Page load timed out, continuing anyway...")
+
+def is_download_finished(download_path, timeout=120, check_interval=2):
+    """Slightly faster download checker"""
+    end_time = time.time() + timeout
+    while time.time() < end_time:
+        partial_files = [f for f in os.listdir(download_path) if f.endswith(('.crdownload', '.part', '.tmp'))]
+        completed_files = [f for f in os.listdir(download_path) if not f.endswith(('.crdownload', '.part', '.tmp'))]
+
+        if completed_files and not partial_files:
+            return True, completed_files
+
+        if partial_files:
+            print(f"   → Still downloading... ({len(partial_files)} partial)")
+        time.sleep(check_interval)
+    return False, []
+
+def clear_partial_downloads():
+    for filename in os.listdir(DOWNLOAD_PATH):
+        if filename.endswith(('.crdownload', '.part', '.tmp')):
+            try:
+                os.remove(os.path.join(DOWNLOAD_PATH, filename))
+            except:
+                pass
+
+def log_result(url: str, success: bool):
+    status = "True" if success else "False"
+    with open(LOG_FILE, "a", encoding="utf-8") as f:
+        f.write(f"{url} | {status}\n")
+    print(f"   Logged: {'SUCCESS' if success else 'FAILED'}")
+
+def run_download(url: str):
+    try:
+        print(f"\n[{time.strftime('%H:%M:%S')}] Processing → {url}")
+        clear_partial_downloads()
+
+        print("   Navigating...")
+        driver.get(url)
+
+        wait_for_page_load(driver, timeout=20)
+        print("   ✓ Page loaded (eager).")
+
+        wait = WebDriverWait(driver, 5)  # Reduced timeout
+
+        # === License checkbox (quick check) ===
+        try:
+            checkbox = wait.until(EC.presence_of_element_located((By.NAME, "license_agree")))
+            if checkbox.is_enabled() and checkbox.is_displayed():
+                checkbox.click()
+                print("   ✓ License checkbox accepted.")
+                time.sleep(0.5)
+        except (TimeoutException, NoSuchElementException):
+            print("   → No checkbox found. Continuing...")
+
+        # === Download button (quick detection) ===
+        try:
+            download_btn = wait.until(EC.element_to_be_clickable((By.ID, "jd_license_submit")))
+            download_btn.click()
+            print("   ✓ Download button clicked.")
+        except (TimeoutException, NoSuchElementException):
+            print("   ✗ Download button NOT found or not clickable → Skipping")
+            log_result(url, False)
+            return False
+
+        # === Wait for download ===
+        print("   Waiting for download (max 2 min)...")
+        success, files = is_download_finished(DOWNLOAD_PATH, timeout=30)
+        if success and files:
+            print(f"   ✓ Download completed! Files: {files}")
+            log_result(url, True)
+            return True
+        else:
+            print("   ✗ Download timed out.")
+            log_result(url, False)
+            return False
+
+    except WebDriverException as e:
+        print(f"   ✗ WebDriver error: {e}")
+        log_result(url, False)
+        return False
+    except Exception as e:
+        print(f"   ✗ Unexpected error: {e}")
+        log_result(url, False)
+        return False
+
+# ====================== MAIN ======================
+if __name__ == "__main__":
+    with open("url-final.txt", "r", encoding="utf-8") as f:
+        links = [line.strip() for line in f if line.strip()]
+
+    links = links[780:]  # start from 251st
+    print(f"Loaded {len(links)} URLs. Starting from index 250.")
+
+    success_count = 0
+    for i, url in enumerate(links, 1):
+        print(f"--- [{i}/{len(links)}] ---")
+        if run_download(url):
+            success_count += 1
+        time.sleep(3)  # Reduced delay between requests
+
+    print(f"\n=== FINISHED ===")
+    print(f"Total: {len(links)} | Success: {success_count} | Failed: {len(links) - success_count}")
+    print(f"Log: {LOG_FILE}")
+
    driver.quit()
Author	SHA1	Message	Date
nq	c9fdd24a3d	packaging	2026-06-04 01:35:55 +03:00
nq	f0814fe15b	temp	2026-05-10 14:39:54 +03:00