"""collects web page html files from dmoz dataset."""
import sys
PY_VERS = (3, 2)
assert (sys.version_info >= PY_VERS), "Sorry, this program requires Python " \
"{}+, not Python {}.".format('.'.join(map(str, PY_VERS)), sys.version[:5])
#########################################
# global import
#########################################
import os
import argparse
from collections import Counter
import xml.sax
import time
import json
import logging
# from GoogleScraper import scrape_with_config, GoogleSearchError
import concurrent.futures
#########################################
# FLAGS
#########################################
PARSER = argparse.ArgumentParser(description='collect web page html data.')
PARSER.add_argument("--data_dir",
type=str,
default="data/",
help="path of data directory.")
# cat_fetch is a subset of CATEGORIES; only those categories are fetched
PARSER.add_argument("--cat_fetch",
type=str,
default="",
# default="Business,Society,Science,Recreation,Shopping,Games",
# default="Arts,Business,Computers,Health",
help="cats to be print, no space, empty to fetch all")
PARSER.add_argument("--pages_per_file",
type=int,
default=5000,
help="number of web pages per json file")
PARSER.add_argument("--max_file_num",
type=int,
default=2,
help="max num of files per cat, -1 for no limit")
PARSER.add_argument("--max_workers",
type=int,
default=16,
help="number of max threads to fetch web pages")
PARSER.add_argument("--fetch_timeout",
type=int,
default=15,
help="max seconds to fetch a web page")
PARSER.add_argument("--max_neighbor",
type=int,
default=15,
help="max # for every kinds of neighbors")
PARSER.add_argument("--max_child",
type=int,
default=20,
help="max # for children")
PARSER.add_argument("--max_html_length",
type=int,
default=1500,
help="max number of characters of each neighbors html length")
PARSER.add_argument("--dataset_type",
type=str,
default="ukwa",
help="dmoz or ukwa")
PARSER.add_argument("--cat_num",
type=int,
default=10,
help="number of categories to collect: 5 and 10")
FLAGS = PARSER.parse_args()
#########################################
# global variables
#########################################
CUR_TIME = time.strftime("%Y-%m-%d_%H-%M-%S")
if FLAGS.dataset_type == 'dmoz':
if FLAGS.cat_num == 5:
CATEGORIES = list(set(["Arts", "Business", "Computers", "Health",
"Sports"]))
elif FLAGS.cat_num == 10:
CATEGORIES = list(set(["Arts", "Business", "Computers", "Health",
'Society', 'Science', 'Sports', 'Recreation',
'Shopping', 'Games']))
# CATEGORIES = list(set(["Arts", "Business", "Computers", "Health",
# 'Society', 'Science', 'Sports', 'Recreation',
# 'Shopping', 'Reference', 'Games', 'Home']))
else:
raise ValueError("cat_num wrong value: " + FLAGS.cat_num)
DMOZ_JSON = "dmoz_{}.json".format(len(CATEGORIES))
elif FLAGS.dataset_type == 'ukwa':
CATEGORIES = list(set(
['Arts & Humanities', 'Government, Law & Politics',
'Society & Culture', 'Business, Economy & Industry',
'Science & Technology', 'Medicine & Health', 'Education & Research',
'Company Web Sites', 'Digital Society', 'Sports and Recreation']))
DMOZ_JSON = "ukwa_{}.json".format(len(CATEGORIES))
else:
raise ValueError("dataset_type has wrong value: {}".format(
FLAGS.dataset_type))
DMOZ_FILE = "content.rdf.u8.gz"
DMOZ_URL = "http://rdf.dmoz.org/rdf/content.rdf.u8.gz"
UKWA_FILE = 'classification.tsv'
CAT_PREFIX = "Top/"
#########################################
# functions
#########################################
def maybe_download(data_dir, s_name, s_url):
"""download and unpack the source file if not exists.
args:
data_dir (str): paht of directory for storing data.
s_name (str): name of compressed source file.
s_url (str): where to download source file.
return:
str: path of unpacked source file directory.
"""
data_dir = os.path.expanduser(data_dir)
if not os.path.isdir(data_dir):
os.mkdir(data_dir)
logging.info("created data_dir: {}".format(data_dir))
s_packed = os.path.join(data_dir, s_name)
logging.info("source path: " + s_packed)
# split twice for .tar.**
s_dir, s_ext = os.path.splitext(s_packed)
if s_dir.endswith(".tar"):
s_dir, e = os.path.splitext(s_dir)
s_ext = e + s_ext
# always create a new directory for unpacked files
if not os.path.isdir(s_dir):
os.mkdir(s_dir)
logging.info("created source dir: " + s_dir)
if os.listdir(s_dir):
logging.info("file already exists:" + s_dir)
else:
if not os.path.isfile(s_packed):
logging.info("downloading" + s_name + "...")
import urllib.request
import shutil
# download_path should == s_packed
# download_path, _ = urllib.urlretrieve(s_url, s_packed)
with urllib.request.urlopen(s_url) as r, open(s_packed, 'wb') as f:
shutil.copyfileobj(r, f)
logging.info('successfully downloaded ' + s_packed)
logging.info("size: {} bytes.".format(os.path.getsize(s_packed)))
# unpack the downloaded source file
logging.info("extracting file: " + s_packed)
if s_ext == ".tar.gz":
import tarfile
with tarfile.open(s_packed, "r:*") as f:
f.extractall(s_dir)
elif s_ext == ".bz2":
# a single file is expected; derive its name from the directory name
s = os.path.join(s_dir, os.path.basename(s_dir))
import bz2
data = bz2.BZ2File(s_packed).read()
# data is bytes, so the output file must be opened in binary mode
with open(s, "wb") as s_unpack:
s_unpack.write(data)
elif s_ext == ".zip":
import zipfile
with zipfile.ZipFile(s_packed, "r") as z:
z.extractall(s_dir)
elif s_ext == ".gz":
# a single file is expected; derive its name from the directory name
s = os.path.join(s_dir, os.path.basename(s_dir))
import gzip
with gzip.open(s_packed, "rb") as f, open(s, "wb") as s_unpack:
s_unpack.write(f.read())
elif s_ext == "":
logging.info("no file extention")
else:
raise ValueError("unknown compressed file")
logging.info("successfully extracted file:")
return s_dir
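# Illustrative sketch of the behavior above (paths are example values, not
# hard-coded anywhere): maybe_download("data/", "content.rdf.u8.gz", DMOZ_URL)
# downloads the archive to data/content.rdf.u8.gz if it is missing, unpacks it
# into data/content.rdf.u8/ and returns that directory path.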
def read_json(filename):
if os.path.isfile(filename):
logging.info("reading from json file:" + filename)
with open(filename) as data_file:
data = json.load(data_file)
logging.info("finish reading json file")
return data
else:
raise FileNotFoundError("json file not found: {}".format(filename))
def write_json(filename, data):
logging.info("writing dmoz to" + filename)
with open(filename, 'w') as outfile:
json.dump(data, outfile, indent=4)
logging.info("finish writing to" + filename)
class DmozHandler(xml.sax.handler.ContentHandler):
def __init__(self):
self._current_page = ''
self._capture_content = False
self._current_content = {}
self._expect_end = False
self.count = []
self.dmoz = {}
for cat in CATEGORIES:
self.dmoz.setdefault(cat, [])
def startElement(self, name, attrs):
if name == 'ExternalPage':
self._current_page = attrs['about']
self._current_content = {}
elif name in ['d:Title', 'd:Description', 'priority', 'topic']:
self._capture_content = True
self._capture_content_type = name
def endElement(self, name):
# Make sure that the only thing after "topic" is "/ExternalPage"
if self._expect_end:
assert name == 'topic' or name == 'ExternalPage'
if name == 'ExternalPage':
self._expect_end = False
def characters(self, content):
if self._capture_content:
assert not self._expect_end
self._current_content[self._capture_content_type] = content
# logging.info self._capture_content_type, self._current_content[self._capture_content_type]
# This makes the assumption that "topic" is the last entity in each dmoz page:
if self._capture_content_type == "topic":
# add url to self.dmoz
if content and content.startswith(CAT_PREFIX):
top_cat = content.split('/')[1]
if top_cat:
self.count.append(top_cat)
cat = content[len(CAT_PREFIX):]
for i in range(len(CATEGORIES)):
if cat.startswith(CATEGORIES[i]):
m_id = len(self.dmoz[CATEGORIES[i]])
title = self._current_content.get("d:Title", "")
desc = self._current_content.get("d:Description",
"")
self.dmoz[CATEGORIES[i]].append(
{"id": m_id,
"url": self._current_page,
"title": title,
"desc": desc})
# logging.info("url:"+ self._current_page)
self._expect_end = True
self._capture_content = False
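# Rough sketch of the mapping this handler performs (record contents are
# hypothetical, element names are the ones handled above): an entry such as
#   <ExternalPage about="http://example.com/">
#     <d:Title>Example</d:Title>
#     <d:Description>An example site.</d:Description>
#     <topic>Top/Arts/Music</topic>
#   </ExternalPage>
# ends up in self.dmoz["Arts"] as
#   {"id": <next index>, "url": "http://example.com/",
#    "title": "Example", "desc": "An example site."}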
def parse_dmoz(file_path, dmoz_json):
"""parse dmoz xml content file.
args:
file_path (str): path of dmoz file.
return:
dmoz (dict): categories map to web pages.
{category:[url_info{desc/id/url/title:value}]}
count (Counter): # of url in every categories.
"""
import random
logging.info("parsing dmoz xml file... (it may take several minites)")
# create an XMLReader
parser = xml.sax.make_parser()
handler = DmozHandler()
parser.setContentHandler(handler)
parser.parse(file_path)
logging.info("dmoz:{}".format({k: len(handler.dmoz[k])
for k in handler.dmoz.keys()}))
logging.info("count:{}".format(Counter(handler.count)))
logging.info("parsed dmoz xml successfully")
logging.info("shuffling dmoz")
for key in handler.dmoz:
logging.info("shuffling category {}".format(key))
random.shuffle(handler.dmoz[key])
logging.info("reseting id")
for i in range(len(handler.dmoz[key])):
handler.dmoz[key][i]['id'] = i
write_json(dmoz_json, handler.dmoz)
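# The json written above (and by parse_ukwa below) has this shape (values are
# hypothetical examples):
# {"Arts": [{"id": 0, "url": "http://example.com/", "title": "Example",
#            "desc": "An example site."}, ...],
#  "Business": [...], ...}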
def parse_ukwa(ukwa_tsv, ukwa_json):
"""parse dmoz xml content file.
args:
ukwa_tsv (str): path of tsv file.
return:
dmoz (dict): categories map to web pages.
{category:[url_info{desc/id/url/title:value}]}
count (Counter): # of url in every categories.
"""
import random
import csv
logging.info("parsing ukwa tsv file... (it may take several minites)")
with open(ukwa_tsv) as tsvin:
tsvin = csv.reader(tsvin, delimiter='\t')
ukwa = {cat: [] for cat in CATEGORIES}
p_cats = []
for row in tsvin:
p_cats.append(row[0])
if row[0] in CATEGORIES:
try:
# print ("row:", row)
url = row[3]
if not url.startswith("http"):
# print("not http: " + url)
url = row[-1]
print("new url:" + url)
ukwa[row[0]].append({"id": 0,
"url": url,
"title": row[2],
"desc": ''})
except Exception:
logging.info("row exception:{}".format(row))
cnt = Counter(p_cats)
logging.info("count:{}".format(Counter(cnt)))
logging.info("parsed ukwa xml successfully")
logging.info("shuffling ukwa")
for key in ukwa:
logging.info("shuffling category {}".format(key))
random.shuffle(ukwa[key])
logging.info("reseting id")
for i in range(len(ukwa[key])):
ukwa[key][i]['id'] = i
write_json(ukwa_json, ukwa)
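# Assumed tsv layout, inferred only from the column indices used above (not
# from any ukwa documentation): column 0 is the category, column 2 the title,
# column 3 the url (falling back to the last column when column 3 is not an
# http url). A hypothetical row
#   Arts & Humanities \t <column 1> \t Example Site \t http://example.org/
# would be stored under ukwa['Arts & Humanities'] with title "Example Site".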
def chunks_generator(l, n):
"""Yield successive n-sized chunks from l."""
for i in range(0, len(l), n):
yield l[i:i + n]
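# Hypothetical usage: list(chunks_generator([1, 2, 3, 4, 5], 2))
# returns [[1, 2], [3, 4], [5]].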
def page_generator(aa):
"""Yield successive n-sized chunks from l."""
for a in aa:
yield a
def get_child_urls(main_page, max_child=20):
"""retrieve urls from giving html page.
args:
main_page(str): html file.
max_child(int): max number of return urls.
return:
list of url string.
"""
from bs4 import BeautifulSoup, SoupStrainer
children = []
for link in BeautifulSoup(main_page,
"html.parser",
parse_only=SoupStrainer('a')):
if link.has_attr('href') and link['href'].startswith("http"):
children.append(link['href'])
if len(children) > max_child:
children = children[:max_child]
return children
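# Minimal sketch with made-up html (only absolute http(s) links are kept):
# get_child_urls('<a href="http://a.com">a</a> <a href="/rel">b</a>')
# returns ['http://a.com'].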
def google_url(op, url, max_num=10):
"""get parent or spousal of url by google sear engine.
args:
op (str): "link" for parent urls or "related" for spousal urls
url (str)
max_num (int): number of return urls
"""
s = op + ':' + url
# See in the config.cfg file for possible values
config = {
'use_own_ip': True,
'keyword': s,
'search_engines': ['google'],
'num_pages_for_keyword': round(max_num / 10),
'scrape_method': 'selenium',
'sel_browser': 'chrome',
'do_caching': False,
'log_level': 50
}
try:
search = scrape_with_config(config)
# let's inspect what we got
urls = []
for serp in search.serps:
for link in serp.links:
urls.append(link.link)
except GoogleSearchError as e:
logging.error("google_url: {}".format(e))
raise
return urls
def google_urls(urls, max_num=10):
"""get parent or spousal of url by google sear engine.
args:
op (str): "link" for parent urls or "related" for spousal urls
url ([url_str, url_str])
max_num (int): number of return urls
"""
# See in the config.cfg file for possible values
config = {
'use_own_ip': True,
'keywords': urls,
'search_engines': ['google'],
'num_pages_for_keyword': round(max_num / 10),
'scrape_method': 'selenium',
'sel_browser': 'chrome',
'do_caching': False,
'log_level': 50,
'num_workers': 8
}
# config = {
# 'use_own_ip': 'True',
# 'keywords': urls,
# 'search_engines': ['google'],
# 'num_pages_for_keyword': round(max_num / 10),
# 'scrape_method': 'http',
# 'do_caching': 'False',
# 'num_workers': 8
# }
try:
logging.debug("a")
search = scrape_with_config(config)
# let's inspect what we got
urls = []
for serp in search.serps:
similar = []
for link in serp.links:
similar.append(link.link)
urls.append(similar)
except GoogleSearchError as e:
logging.error("google_url: {}".format(e))
raise
return urls
def create_nodes(pages, init_id=0):
"""
args:
[[url, html_text]]
init_id: start id number of relatives
"""
nodes = []
count_id = init_id
for page in pages:
node = {}
node["r_id"] = count_id
node["r_url"] = page[0]
node["r_html"] = page[1]
# create_nodes.count += 1
count_id += 1
nodes.append(node)
return nodes
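# Hypothetical example: create_nodes([['http://a.com', '<title>a<title>']],
# init_id=3) returns
# [{"r_id": 3, "r_url": "http://a.com", "r_html": "<title>a<title>"}].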
def get_htmls(urls, timeout):
"""get htmls from urls array.
args:
urls([str]): urls string array.
timeout(int): seconds
return:
[url, html]
url: str
html: str
"""
import urllib.request
ret = []
for url in urls:
try:
with urllib.request.urlopen(url, timeout=timeout) as conn:
html = conn.read().decode("utf-8", errors="ignore")
except Exception:
# skip urls that fail to download, decode, or time out
pass
else:
ret.append([url, html])
return ret
def parse_htmls(htmls, max_len=-1, dmoz=None):
"""
args:
[[url, html]]
return:
[[url, text from html]]
"""
from bs4 import BeautifulSoup
import re
new_htmls = []
for html in htmls:
soup = BeautifulSoup(html[1], "html.parser")
new_html = []
if dmoz and len(dmoz) == 2 and dmoz[0] and dmoz[1]:
new_html.append('<core>' + dmoz[0] + '. ' + dmoz[1] + '</core>')
if html[0]:
s = html[0].replace('/', ' ')
s = s.replace('.', ' ')
s = s.replace(':', ' ')
new_html.append('<url>' + s + '<url>')
title = soup.title
# soup.title.string can be None, which would break .strip()
if title and title.string:
new_html.append('<title>' + title.string.strip() + '<title>')
meta = []
for a in soup.find_all('meta'):
if 'name' in a.attrs and (a['name'] == "description" or
a['name'] == 'keywords'):
meta.append(a['content'].strip())
if meta:
new_html.append('<meta>' + ' '.join(meta) + '<meata>')
heading = []
for a in soup.find_all(re.compile("h[1-5]")):
heading.append(a.get_text().strip())
if heading:
new_html.append('<heading>' + ' '.join(heading) + '<heading>')
paragraph = []
for a in soup.find_all('p'):
paragraph.append(a.get_text().strip())
if paragraph:
new_html.append('<paragraph>' + ' '.join(paragraph) +
'<paragraph>')
tmp = ' '.join(new_html)
if max_len > 0:
if len(tmp) > max_len:
tmp = tmp[:max_len]
new_htmls.append([html[0], tmp])
return new_htmls
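# Sketch of the output with made-up input: for a page whose html has a title
# and one paragraph, the extracted text looks roughly like
#   "<core>Example. An example site.</core> <url>http   example com<url>
#    <title>Example<title> <paragraph>Hello world.<paragraph>"
# (pseudo tags exactly as emitted above, with no closing slash except <core>),
# truncated to max_len characters when max_len > 0.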
def fetch_single_page(url, timeout):
"""Retrieve a single page and report the url and contents"""
import urllib.request
with urllib.request.urlopen(url["url"], timeout=timeout) as conn:
main_page = conn.read().decode("utf-8", errors="ignore")
page = {}
page["id"] = url["id"]
page["url"] = url["url"]
main_pages = parse_htmls(
[[page["url"], main_page]],
FLAGS.max_html_length,
dmoz=[url['title'], url['desc']])
# page["nodes"].extend(create_nodes(main_pages))
page['html'] = main_pages[0][1]
assert isinstance(page['html'], str), "error: main_pages not string"
logging.debug("add main")
# create_nodes.count = 0
# page["edges"] = []
page["relatives"] = []
# get child neighbors
try:
child_urls = get_child_urls(main_page, max_child=FLAGS.max_child)
child_pages = get_htmls(child_urls, timeout)
child_pages = parse_htmls(child_pages, FLAGS.max_html_length)
page["relatives"].extend(create_nodes(child_pages))
logging.debug("add children")
except Exception as e:
logging.error("child: {}".format(e))
# # get parent neighbors
# parent_urls = google_url("link", url["url"], max_num=FLAGS.max_neighbor)
# parent_pages = get_htmls(parent_urls, timeout)
# page["relatives"].extend(create_nodes(parent_pages))
# # get sibling neighbors
# sibling_count = 0
# for sibling in parent_pages:
# sibling_urls = get_child_urls(sibling[1], max_child=FLAGS.max_neighbor)
# sibling_pages = get_htmls(sibling_urls, timeout)
# page["relatives"].extend(create_nodes(sibling_pages))
# sibling_count += len(sibling_pages)
# get similar neighbors
# max_num should be 10 !!
# try:
# if 'similar' in url and url['similar']:
# similar_pages = get_htmls(url['similar'], timeout)
# similar_pages = parse_htmls(similar_pages, 1024)
# page["relatives"].extend(create_nodes(similar_pages))
# logging.debug("add similar")
# except Exception as e:
# logging.error("smilar page: {}".format(e))
try:
logging.debug("\nchild length: {}".format(len(child_pages)))
# print("parent length: {}".format(len(parent_pages)))
# print("sibling length: {}".format(sibling_count))
# logging.debug("similar length: {}".format(len(similar_pages)))
logging.info("url id: {}".format(url['id']))
except Exception as e:
logging.error("print: {}".format(e))
return page
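# Shape of the returned page dict (values are hypothetical):
# {"id": 42, "url": "http://example.com/", "html": "<core>...</core> ...",
#  "relatives": [{"r_id": 0, "r_url": "http://example.com/about",
#                 "r_html": "..."}, ...]}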
def fetch_chunk_pages(urls, max_workers):
"""
args:
return:
number of valid pages downloaded in this chunk.
"""
# try:
# # get similar pages for entire chunk
# logging.debug("get similar pages")
# tmp = ["related:" + u['url'] for u in urls]
# similar_urls = google_urls(tmp, max_num=10)
# for i in range(len(urls)):
# urls[i]['similar'] = similar_urls[i]
# logging.info("similar for chunk:{}".format(len(similar_urls)))
# except Exception as e:
# logging.error("cannot google similar: {}".format(e))
logging.info('start fetching chunk pages')
pages = []
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(
max_workers=max_workers) as executor:
# Start the load operations and mark each future with its URL
future_to_url = {
executor.submit(fetch_single_page, url, FLAGS.fetch_timeout): url
for url in urls
}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
page = future.result()
except Exception as exc:
logging.error('%r:%r generated an exception: %s' %
(url["id"], url["url"], exc))
else:
# logging.info('%r page is %d bytes' % (url["url"], len(page)))
pages.append(page)
return pages
def fetch_pages(dmoz_json, out_dir):
"""grab html for every url in dmoz, then write to the json file.
Both main and four kinds of relatives html will be collected.
"""
dmoz = read_json(dmoz_json)
keys = FLAGS.cat_fetch.split(',')
if not keys[0]:
keys = dmoz.keys()
for key in keys:
key_dir = os.path.join(out_dir, key)
os.makedirs(key_dir)
logging.info("\n\nwrite json files to folder: {}".format(key_dir))
# urls = page_generator(dmoz[key])
cur_chunk = []
dmoz_head = 0
j_ind = 0
while j_ind < FLAGS.max_file_num and dmoz_head < len(dmoz[key]):
t_start = time.time()
sub = round((FLAGS.pages_per_file - len(cur_chunk)) * 1.5)
while sub > 0 and len(
cur_chunk) < FLAGS.pages_per_file and dmoz_head < len(dmoz[
key]):
logging.info("\n\nj_ind:{}, key:{}, sub:{}".format(j_ind, key,
sub))
dmoz_tail = dmoz_head + sub
if dmoz_tail > len(dmoz[key]):
logging.info("reach the end of dmoz[key]")
dmoz_tail = len(dmoz[key])
sub_chunk = fetch_chunk_pages(dmoz[key][dmoz_head:dmoz_tail],
FLAGS.max_workers)
cur_chunk.extend(sub_chunk)
dmoz_head = dmoz_tail
# write chunk json file
chunk_file = os.path.join(key_dir, str(j_ind) + ".json")
# write chunk of pages json file
logging.info("\n\n\nfinish fetching chunk {}".format(j_ind))
write_json(chunk_file, cur_chunk[:FLAGS.pages_per_file])
cur_chunk = cur_chunk[FLAGS.pages_per_file:]
duration = time.time() - t_start
logging.info("total time used in chunk: {}s".format(duration))
logging.info("average time of fetching one page: {}\n\n".format(
duration / FLAGS.pages_per_file))
j_ind += 1
logging.info("\ntotal valid chunks # for {}: {}".format(key, j_ind))
def set_logging(stream=False, fileh=False, filename="example.log"):
"""set basic logging configurations (root logger).
args:
stream (bool): whether to log to the console.
fileh (bool): whether to write the log to a file.
filename (str): the path of the log file.
return:
configured root logger.
"""
handlers = []
level = logging.INFO
log_format = '%(asctime)s: %(message)s'
if stream:
handlers.append(logging.StreamHandler())
if fileh:
handlers.append(logging.FileHandler(filename))
logging.basicConfig(format=log_format, handlers=handlers, level=level)
return logging.getLogger()
def main():
print("start of main\n")
html_dir = os.path.join(FLAGS.data_dir, "html_" + CUR_TIME)
os.makedirs(html_dir)
log_file = os.path.join(html_dir, "log")
set_logging(stream=True, fileh=True, filename=log_file)
logging.info("\nall arguments:")
logging.info("CATEGORIES: {}".format(', '.join(CATEGORIES)))
for arg in vars(FLAGS):
logging.info("{:12}{}".format(arg, getattr(FLAGS, arg)))
# download dmoz content file
dmoz_path = maybe_download(FLAGS.data_dir, DMOZ_FILE, DMOZ_URL)
dmoz_cont = os.path.join(dmoz_path, os.path.splitext(DMOZ_FILE)[0])
if not os.path.isfile(dmoz_cont):
raise FileNotFoundError("dmoz_cont")
dmoz_json = os.path.join(FLAGS.data_dir, DMOZ_JSON)
if os.path.isfile(dmoz_json):
logging.info("file already exists, won't parse dmoz xml: {}".format(
dmoz_json))
else:
if FLAGS.dataset_type == 'dmoz':
# parse dmoz xml, and write to dmoz_json
# json format: [url, url]
# url: {"id":int, "url":str, "title":str, "desc":str}
parse_dmoz(dmoz_cont, dmoz_json)
else:
ukwa_tsv = os.path.join(FLAGS.data_dir, UKWA_FILE)
parse_ukwa(ukwa_tsv, dmoz_json)
# grab main and relatives html text, write to disc
# json format: [page, page]
# page: {"id":int, "url":str, "edges":[edge, edge], "nodes":[node, node]}
# the first node is the target page
# edge: [n_id, n_id]
# node: {"n_id":int, "n_url":str, "html":str, ???}
# second edition:
# json format: [page, page]
# page: {"id":int, "url":str, "html":str, "relatives":[relative, relative]}
# the first node is the target page
# relative: {"r_id":int, "r_url":str, "r_html":str}
fetch_pages(dmoz_json, html_dir)
logging.info("\nend of main")
if __name__ == "__main__":
main()