1 Star 0 Fork 1

jack2583/PythonExamples

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
mobilePhoneSpecsScrapper.py 4.38 KB
一键复制 编辑 原始数据 按行查看 历史
RishabhSinha07 提交于 2020-10-04 02:23 +08:00 . Create mobilePhoneSpecsScrapper.py
import requests
from bs4 import BeautifulSoup
import csv
import os
import time
import json
class Phonearena():
    """Scraper for the phone listing and quick-spec pages of phonearena.com.

    Typical use is two steps: ``crawl_phone_urls`` collects the link of every
    phone from the paginated listing, then
    ``crawl_phones_models_specification`` visits each link and extracts the
    quick-spec summary (release date, display, camera, ...).
    """

    # Seconds to wait on a request before giving up, so that a stalled
    # connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.phones = []
        self.features = ["Brand", "Model Name", "Model Image"]
        self.temp1 = []
        self.phones_brands = []
        # PhoneArena phone listing url; page numbers are appended to it.
        self.url = 'https://www.phonearena.com/phones/'
        # Base name used for the output files.
        self.new_folder_name = 'GSMArenaDataset'
        # NOTE: despite its name this is used as a *file name prefix*
        # ("<cwd>/GSMArenaDataset" + "-PhoneSpecs.json"), not as a directory.
        self.absolute_path = os.getcwd().strip() + '/' + self.new_folder_name

    def crawl_html_page(self, sub_url):
        """Download ``sub_url`` and return it parsed as a BeautifulSoup tree.

        Any failure while fetching or parsing prints a hint for the user and
        terminates the script by raising ``SystemExit``.
        """
        try:
            page = requests.get(sub_url, timeout=self.REQUEST_TIMEOUT)
            return BeautifulSoup(page.text, 'html.parser')
        except Exception as err:
            # requests raises its own exception types (not the builtin
            # ConnectionError), so a single broad handler covers them all.
            print("Please check your network connection and re-run the script.")
            # Non-zero status so that callers/shells can see the failure.
            raise SystemExit(1) from err

    def crawl_phone_urls(self, last_page=237):
        """Return the links of all phones found on listing pages 1..last_page.

        ``last_page`` defaults to 237, the number of listing pages the site
        had when the script was written.
        """
        phones_urls = []
        for page_number in range(1, last_page + 1):
            page_url = self.url + "page/" + str(page_number)
            print(page_url)
            soup = self.crawl_html_page(page_url)
            for item in soup.findAll("div", {"class": "stream-item"}):
                anchor = item.find('a')
                # Skip tiles without a usable link instead of crashing the
                # whole crawl on one malformed entry.
                if anchor is None or not anchor.get('href'):
                    continue
                phones_urls.append(anchor['href'])
        return phones_urls

    @staticmethod
    def _quick_spec_text(specs_html, css_class):
        """Return the title text of the quick-spec entry with ``css_class``."""
        return specs_html.find(class_=css_class).find(class_="title").p.text

    def _parse_phone_page(self, soup):
        """Extract ``(model_name, specs_dict)`` from a parsed phone page.

        Raises ``AttributeError``/``TypeError``/``KeyError`` when the page
        does not have the expected structure; the caller treats that as
        "skip this phone".
        """
        model = soup.find(class_='page__section page__section_quickSpecs')
        model_name = model.find("header").h1.text
        model_img = model.find(class_='head-image').find('img')['data-src']
        specs_html = model.find(
            class_="phone__section phone__section_widget_quickSpecs")
        specs = {
            "image": model_img,
            "release_date": self._quick_spec_text(specs_html, "calendar"),
            "display": self._quick_spec_text(specs_html, "display"),
            "camera": self._quick_spec_text(specs_html, "camera"),
            "hardware": self._quick_spec_text(specs_html, "hardware"),
            "storage": self._quick_spec_text(specs_html, "storage"),
            "battery": self._quick_spec_text(specs_html, "battery"),
            "os": self._quick_spec_text(specs_html, "os"),
        }
        return model_name, specs

    def crawl_phones_models_specification(self, li):
        """Scrape the quick specs of every phone page linked in ``li``.

        Returns a dict mapping model name to its spec dict. After each
        successfully parsed phone the accumulated result is rewritten to
        ``<absolute_path>-PhoneSpecs.json`` so that progress survives an
        interruption. Phones whose page cannot be parsed are reported and
        skipped.
        """
        phone_data = {}
        for link in li:
            print(link)
            try:
                soup = self.crawl_html_page(link)
                model_name, specs = self._parse_phone_page(soup)
                phone_data[model_name] = specs
                # Use the instance's own path rather than a module-level
                # ``obj`` global, which only exists when run as a script.
                with open(self.absolute_path + '-PhoneSpecs.json', 'w') as of:
                    json.dump(phone_data, of)
            except Exception:
                # Deliberately not a bare ``except``: Ctrl-C and the
                # SystemExit raised on network failure must still propagate.
                print("Exception happened!")
                continue
        return phone_data
if __name__ == "__main__":
    # Script entry point: crawl the phone links, persist them, then crawl the
    # specification page behind each link.
    obj = Phonearena()
    urls_file = obj.absolute_path + '-Phoneurls.json'
    try:
        # Step 1: Scrape links to all the individual phone specs pages and
        # save them so that this step does not need to be run again.
        phone_urls = obj.crawl_phone_urls()
        with open(urls_file, 'w') as of:
            json.dump(phone_urls, of)
        # Step 2: Read the saved links back and scrape each phone page.
        # The path must be the evaluated expression, not a quoted string
        # containing the expression text.
        with open(urls_file, "r") as inp:
            temp = json.load(inp)
        phone_specs = obj.crawl_phones_models_specification(temp)
    except KeyboardInterrupt:
        print("File has been stopped due to KeyBoard Interruption.")
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/jack2583/pythonExamples.git
git@gitee.com:jack2583/pythonExamples.git
jack2583
pythonExamples
PythonExamples
master

搜索帮助