341 lines
7.9 KiB
Plaintext
341 lines
7.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "0db89081",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def get_links_from_page(url, timeout=30):\n",
"    \"\"\"Return the href of every \"Изтегляне\" (download) link found on a page.\n",
"\n",
"    The page is fetched with ``requests`` and parsed with BeautifulSoup.\n",
"    An ``<a>`` whose text matches contributes its own ``href``; a matching\n",
"    ``<button>`` contributes the ``href`` of the nearest enclosing ``<a>``.\n",
"\n",
"    Args:\n",
"        url: Address of the page to scan.\n",
"        timeout: Seconds to wait for the server before giving up. Without a\n",
"            timeout, ``requests`` may wait indefinitely on a stalled connection.\n",
"\n",
"    Returns:\n",
"        list[str]: hrefs exactly as written in the HTML (not resolved against\n",
"        ``url``), in the order BeautifulSoup yields them, duplicates included.\n",
"\n",
"    Raises:\n",
"        requests.RequestException: on connection failure, timeout, or an\n",
"            HTTP 4xx/5xx response.\n",
"    \"\"\"\n",
"    import re\n",
"\n",
"    import requests\n",
"    from bs4 import BeautifulSoup\n",
"\n",
"    response = requests.get(url, timeout=timeout)\n",
"    # Fail loudly on 4xx/5xx: an error page holds no download links, so it\n",
"    # would otherwise look exactly like a genuinely empty listing.\n",
"    response.raise_for_status()\n",
"    soup = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
"    # re.IGNORECASE also catches variations like \"ИЗТЕГЛЯНЕ\".\n",
"    # NOTE(review): `string=` filters on each tag's `.string`, so a tag with\n",
"    # several child nodes (e.g. an icon plus text) is presumably skipped.\n",
"    target_text = \"Изтегляне\"\n",
"    label_pattern = re.compile(target_text, re.IGNORECASE)\n",
"\n",
"    download_links = []\n",
"    for tag in soup.find_all(['a', 'button'], string=label_pattern):\n",
"        # A <button> has no href of its own; use the <a> wrapping it, if any.\n",
"        anchor = tag if tag.name == 'a' else tag.find_parent('a')\n",
"        href = anchor.get('href') if anchor is not None else None\n",
"        if href:\n",
"            download_links.append(href)\n",
"\n",
"    return download_links"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "1bca41d0",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Page: 1\n",
|
|
"Page: 2\n",
|
|
"Page: 3\n",
|
|
"Page: 4\n",
|
|
"Page: 5\n",
|
|
"Page: 6\n",
|
|
"Page: 7\n",
|
|
"Page: 8\n",
|
|
"Page: 9\n",
|
|
"Page: 10\n",
|
|
"Page: 11\n",
|
|
"Page: 12\n",
|
|
"Page: 13\n",
|
|
"Page: 14\n",
|
|
"Page: 15\n",
|
|
"Page: 16\n",
|
|
"Page: 17\n",
|
|
"Page: 18\n",
|
|
"Page: 19\n",
|
|
"Page: 20\n",
|
|
"Page: 21\n",
|
|
"Page: 22\n",
|
|
"Page: 23\n",
|
|
"Page: 24\n",
|
|
"Page: 25\n",
|
|
"Page: 26\n",
|
|
"Page: 27\n",
|
|
"Page: 28\n",
|
|
"Page: 29\n",
|
|
"Page: 30\n",
|
|
"Page: 31\n",
|
|
"Page: 32\n",
|
|
"Page: 33\n",
|
|
"Page: 34\n",
|
|
"Page: 35\n",
|
|
"Page: 36\n",
|
|
"Page: 37\n",
|
|
"Page: 38\n",
|
|
"Page: 39\n",
|
|
"Page: 40\n",
|
|
"Page: 41\n",
|
|
"Page: 42\n",
|
|
"Page: 43\n",
|
|
"Page: 44\n",
|
|
"Page: 45\n",
|
|
"Page: 46\n",
|
|
"Page: 47\n",
|
|
"Page: 48\n",
|
|
"Page: 49\n",
|
|
"Page: 50\n",
|
|
"Page: 51\n",
|
|
"Page: 52\n",
|
|
"Page: 53\n",
|
|
"Page: 54\n",
|
|
"Page: 55\n",
|
|
"Page: 56\n",
|
|
"Page: 57\n",
|
|
"Page: 58\n",
|
|
"Page: 59\n",
|
|
"Page: 60\n",
|
|
"Page: 61\n",
|
|
"Page: 62\n",
|
|
"Page: 63\n",
|
|
"Page: 64\n",
|
|
"Page: 65\n",
|
|
"Page: 66\n",
|
|
"Page: 67\n",
|
|
"Page: 68\n",
|
|
"Page: 69\n",
|
|
"Page: 70\n",
|
|
"Page: 71\n",
|
|
"Page: 72\n",
|
|
"Page: 73\n",
|
|
"Page: 74\n",
|
|
"Page: 75\n",
|
|
"Page: 76\n",
|
|
"Page: 77\n",
|
|
"Page: 78\n",
|
|
"Page: 79\n",
|
|
"Page: 80\n",
|
|
"Page: 81\n",
|
|
"Page: 82\n",
|
|
"Page: 83\n",
|
|
"Page: 84\n",
|
|
"Page: 85\n",
|
|
"Page: 86\n",
|
|
"Page: 87\n",
|
|
"Page: 88\n",
|
|
"Page: 89\n",
|
|
"Page: 90\n",
|
|
"Page: 91\n",
|
|
"Page: 92\n",
|
|
"Page: 93\n",
|
|
"Page: 94\n",
|
|
"Page: 95\n",
|
|
"Page: 96\n",
|
|
"Page: 97\n",
|
|
"Page: 98\n",
|
|
"Page: 99\n",
|
|
"Page: 100\n",
|
|
"Page: 101\n",
|
|
"Page: 102\n",
|
|
"Page: 103\n",
|
|
"Page: 104\n",
|
|
"Page: 105\n",
|
|
"Page: 106\n",
|
|
"Page: 107\n",
|
|
"Page: 108\n",
|
|
"Page: 109\n",
|
|
"Page: 110\n",
|
|
"Page: 111\n",
|
|
"Page: 112\n",
|
|
"Page: 113\n",
|
|
"Page: 114\n",
|
|
"Page: 115\n",
|
|
"Page: 116\n",
|
|
"Page: 117\n",
|
|
"Page: 118\n",
|
|
"Page: 119\n",
|
|
"Page: 120\n",
|
|
"Page: 121\n",
|
|
"Page: 122\n",
|
|
"Page: 123\n",
|
|
"Page: 124\n",
|
|
"Page: 125\n",
|
|
"Page: 126\n",
|
|
"Page: 127\n",
|
|
"Page: 128\n",
|
|
"Page: 129\n",
|
|
"Page: 130\n",
|
|
"Page: 131\n",
|
|
"Page: 132\n",
|
|
"Page: 133\n",
|
|
"Page: 134\n",
|
|
"Page: 135\n",
|
|
"Page: 136\n",
|
|
"Page: 137\n",
|
|
"Page: 138\n",
|
|
"Page: 139\n",
|
|
"Page: 140\n",
|
|
"Page: 141\n",
|
|
"Page: 142\n",
|
|
"Page: 143\n",
|
|
"Page: 144\n",
|
|
"Page: 145\n",
|
|
"Page: 146\n",
|
|
"Page: 147\n",
|
|
"Page: 148\n",
|
|
"Page: 149\n",
|
|
"Page: 150\n",
|
|
"Page: 151\n",
|
|
"Page: 152\n",
|
|
"Page: 153\n",
|
|
"Page: 154\n",
|
|
"Page: 155\n",
|
|
"Page: 156\n",
|
|
"Page: 157\n",
|
|
"Page: 158\n",
|
|
"Page: 159\n",
|
|
"Page: 160\n",
|
|
"Page: 161\n",
|
|
"Page: 162\n",
|
|
"Page: 163\n",
|
|
"Page: 164\n",
|
|
"Page: 165\n",
|
|
"Page: 166\n",
|
|
"Page: 167\n",
|
|
"Page: 168\n",
|
|
"Page: 169\n",
|
|
"Page: 170\n",
|
|
"Page: 171\n",
|
|
"Page: 172\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import os\n",
|
|
"import sys\n",
|
|
"import time\n",
|
|
"import random\n",
|
|
"\n",
|
|
"start_num = 0  # value of the `start` query parameter for the first page\n",
"iter_num = 5  # `start` advances by this much from one page to the next\n",
"string_plchldr = \"number_start_placeholder\"\n",
"\n",
"link = \"https://www.bgmountains.org/bg/gps-tracks/cats/all?type=all&start=number_start_placeholder\"\n",
"lim = 172  # number of listing pages to request; taken from the website\n",
"list_links = []\n",
"output_links = []  # one list of hrefs per listing page, in page order\n",
"# Explicit UTF-8 so hrefs with non-ASCII characters can always be logged,\n",
"# whatever the platform's default encoding is.\n",
"with open(\"out.log\", \"w\", encoding=\"utf-8\") as f:\n",
"    for i in range(lim):\n",
"        print(\"Page: \", i+1)\n",
"        page_url = link.replace(string_plchldr, str(start_num + iter_num * i))\n",
"        response_links = get_links_from_page(page_url)\n",
"        output_links.append(response_links)\n",
"        # Log before pausing so an interrupt during the sleep cannot leave\n",
"        # out.log one page behind output_links.\n",
"        f.write(str(response_links) + \"\\n\")\n",
"        # Pause a random 0-6 whole seconds between requests.\n",
"        time.sleep(random.randint(0, 6))\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "d87c4a49",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"[['/bg/gps-tracks/cats/summary/4-belasitsa/1-biloto',\n",
|
|
" '/bg/gps-tracks/cats/summary/3-osogovo-belasishka/5-dzhama-kadiitsa',\n",
|
|
" '/bg/gps-tracks/cats/summary/33-pirin/144-sandanski-lilyanovo',\n",
|
|
" '/bg/gps-tracks/cats/summary/6-osogovo/7-trite-buki-ruen',\n",
|
|
" '/bg/gps-tracks/cats/summary/8-maleshevska/11-krupnik']]"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"output_links"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "3de85cf5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from urllib.parse import urljoin\n",
"\n",
"# Flatten the per-page lists in output_links into `flat` and write one\n",
"# absolute URL per line to url-final.txt.\n",
"flat = []\n",
"\n",
"with open(\"url-final.txt\", \"w\", encoding=\"utf-8\") as f:\n",
"    for item in output_links:\n",
"        for entry in item:\n",
"            flat.append(entry)\n",
"            # urljoin gives the same result as plain concatenation for\n",
"            # root-relative hrefs (\"/bg/...\"), and additionally leaves an\n",
"            # already-absolute href intact and inserts the missing \"/\"\n",
"            # when an href has no leading slash.\n",
"            f.write(urljoin(\"https://www.bgmountains.org\", entry) + \"\\n\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "effc0400",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"858\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"858"
|
|
]
|
|
},
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|