Files
bg-mountains-scraping-routes/scrape.ipynb
2026-04-14 16:46:00 -07:00

341 lines
7.9 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"id": "0db89081",
"metadata": {},
"outputs": [],
"source": [
"def get_links_from_page(url):\n",
" import requests\n",
" from bs4 import BeautifulSoup\n",
" import re\n",
"\n",
" response = requests.get(url)\n",
" soup = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" # 2. Find all <a> or <button> tags containing the target text\n",
" # We use re.IGNORECASE to catch variations like \"ИЗТЕГЛЯНЕ\"\n",
" target_text = \"Изтегляне\"\n",
" elements = soup.find_all(['a', 'button'], string=re.compile(target_text, re.IGNORECASE))\n",
"\n",
" download_links = []\n",
"\n",
" for tag in elements:\n",
" # If the tag is directly an <a>, get its href\n",
" if tag.name == 'a' and tag.get('href'):\n",
" download_links.append(tag['href'])\n",
" \n",
" # If it's a <button>, it might be wrapped in an <a> tag\n",
" elif tag.name == 'button':\n",
" parent_a = tag.find_parent('a')\n",
" if parent_a and parent_a.get('href'):\n",
" download_links.append(parent_a['href'])\n",
"\n",
" # # Print results\n",
" # for link in set(download_links): # 'set' removes duplicates\n",
" # print(f\"Found download link: {link}\")\n",
" return download_links"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "1bca41d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Page: 1\n",
"Page: 2\n",
"Page: 3\n",
"Page: 4\n",
"Page: 5\n",
"Page: 6\n",
"Page: 7\n",
"Page: 8\n",
"Page: 9\n",
"Page: 10\n",
"Page: 11\n",
"Page: 12\n",
"Page: 13\n",
"Page: 14\n",
"Page: 15\n",
"Page: 16\n",
"Page: 17\n",
"Page: 18\n",
"Page: 19\n",
"Page: 20\n",
"Page: 21\n",
"Page: 22\n",
"Page: 23\n",
"Page: 24\n",
"Page: 25\n",
"Page: 26\n",
"Page: 27\n",
"Page: 28\n",
"Page: 29\n",
"Page: 30\n",
"Page: 31\n",
"Page: 32\n",
"Page: 33\n",
"Page: 34\n",
"Page: 35\n",
"Page: 36\n",
"Page: 37\n",
"Page: 38\n",
"Page: 39\n",
"Page: 40\n",
"Page: 41\n",
"Page: 42\n",
"Page: 43\n",
"Page: 44\n",
"Page: 45\n",
"Page: 46\n",
"Page: 47\n",
"Page: 48\n",
"Page: 49\n",
"Page: 50\n",
"Page: 51\n",
"Page: 52\n",
"Page: 53\n",
"Page: 54\n",
"Page: 55\n",
"Page: 56\n",
"Page: 57\n",
"Page: 58\n",
"Page: 59\n",
"Page: 60\n",
"Page: 61\n",
"Page: 62\n",
"Page: 63\n",
"Page: 64\n",
"Page: 65\n",
"Page: 66\n",
"Page: 67\n",
"Page: 68\n",
"Page: 69\n",
"Page: 70\n",
"Page: 71\n",
"Page: 72\n",
"Page: 73\n",
"Page: 74\n",
"Page: 75\n",
"Page: 76\n",
"Page: 77\n",
"Page: 78\n",
"Page: 79\n",
"Page: 80\n",
"Page: 81\n",
"Page: 82\n",
"Page: 83\n",
"Page: 84\n",
"Page: 85\n",
"Page: 86\n",
"Page: 87\n",
"Page: 88\n",
"Page: 89\n",
"Page: 90\n",
"Page: 91\n",
"Page: 92\n",
"Page: 93\n",
"Page: 94\n",
"Page: 95\n",
"Page: 96\n",
"Page: 97\n",
"Page: 98\n",
"Page: 99\n",
"Page: 100\n",
"Page: 101\n",
"Page: 102\n",
"Page: 103\n",
"Page: 104\n",
"Page: 105\n",
"Page: 106\n",
"Page: 107\n",
"Page: 108\n",
"Page: 109\n",
"Page: 110\n",
"Page: 111\n",
"Page: 112\n",
"Page: 113\n",
"Page: 114\n",
"Page: 115\n",
"Page: 116\n",
"Page: 117\n",
"Page: 118\n",
"Page: 119\n",
"Page: 120\n",
"Page: 121\n",
"Page: 122\n",
"Page: 123\n",
"Page: 124\n",
"Page: 125\n",
"Page: 126\n",
"Page: 127\n",
"Page: 128\n",
"Page: 129\n",
"Page: 130\n",
"Page: 131\n",
"Page: 132\n",
"Page: 133\n",
"Page: 134\n",
"Page: 135\n",
"Page: 136\n",
"Page: 137\n",
"Page: 138\n",
"Page: 139\n",
"Page: 140\n",
"Page: 141\n",
"Page: 142\n",
"Page: 143\n",
"Page: 144\n",
"Page: 145\n",
"Page: 146\n",
"Page: 147\n",
"Page: 148\n",
"Page: 149\n",
"Page: 150\n",
"Page: 151\n",
"Page: 152\n",
"Page: 153\n",
"Page: 154\n",
"Page: 155\n",
"Page: 156\n",
"Page: 157\n",
"Page: 158\n",
"Page: 159\n",
"Page: 160\n",
"Page: 161\n",
"Page: 162\n",
"Page: 163\n",
"Page: 164\n",
"Page: 165\n",
"Page: 166\n",
"Page: 167\n",
"Page: 168\n",
"Page: 169\n",
"Page: 170\n",
"Page: 171\n",
"Page: 172\n"
]
}
],
"source": [
"import os\n",
"import sys\n",
"import time\n",
"import random\n",
"\n",
"start_num = 0 \n",
"iter_num = 5\n",
"string_plchldr = \"number_start_placeholder\"\n",
"\n",
"link = f\"https://www.bgmountains.org/bg/gps-tracks/cats/all?type=all&start=number_start_placeholder\"\n",
"lim = 172\n",
"#take from website\n",
"list_links = []\n",
"output_links = []\n",
"with open(\"out.log\",\"w\") as f:\n",
" for i in range(lim):\n",
" print(\"Page: \",i+1)\n",
" response_links = get_links_from_page(link.replace(string_plchldr,str(iter_num*i)))\n",
" output_links.append(response_links)\n",
" time.sleep(random.randint(0,6))\n",
" f.write(str(response_links) + \"\\n\")\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d87c4a49",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['/bg/gps-tracks/cats/summary/4-belasitsa/1-biloto',\n",
" '/bg/gps-tracks/cats/summary/3-osogovo-belasishka/5-dzhama-kadiitsa',\n",
" '/bg/gps-tracks/cats/summary/33-pirin/144-sandanski-lilyanovo',\n",
" '/bg/gps-tracks/cats/summary/6-osogovo/7-trite-buki-ruen',\n",
" '/bg/gps-tracks/cats/summary/8-maleshevska/11-krupnik']]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output_links"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "3de85cf5",
"metadata": {},
"outputs": [],
"source": [
"flat = []\n",
"\n",
"with open(\"url-final.txt\",\"w\") as f:\n",
" for item in output_links:\n",
" for entry in item:\n",
" flat.append(entry)\n",
" f.write(\"https://www.bgmountains.org\"+entry+\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "effc0400",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"858\n"
]
},
{
"data": {
"text/plain": [
"858"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}