I have recently been reading "Web Scraping with Python" as a way to get familiar with programming in Python 2.7.

The http://example.webscraping.com/ site that the book relies on has changed in places, so some of the book's code no longer works exactly as printed, and I adjusted it slightly.

The main functionality is to download pages from the site, scrape the data we want to collect, and save it to a CSV file.

The third-party library lxml needs to be installed in advance.

The full code is below.
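For readers new to lxml, the extraction step used in scrape_callback.py further down boils down to parsing the HTML and picking out table cells with CSS selectors. Here is a minimal sketch of that idea; the HTML fragment and the value shown are made up purely for illustration and are not taken from the book or the live site:

import lxml.html

# a made-up fragment mimicking the structure of the country pages on the example site
html = '<table><tr id="places_area__row"><td class="w2p_fw">244,820 square kilometres</td></tr></table>'
tree = lxml.html.fromstring(html)
# cssselect() returns the matching elements; text_content() strips the tags
cell = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
print cell.text_content()  # prints: 244,820 square kilometres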
link_crawler.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex"""
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URLs have been downloaded
    num_urls = 0
    # http://example.webscraping.com no longer serves a robots.txt, so rp has no effect here
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        # if rp has not read any robots.txt rules, nothing is treated as unfetchable
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])

            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url


class Throttle:
    """Throttle downloading by sleeping between requests to same domain"""
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently"""
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain"""
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URLs belong to same domain"""
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain"""
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html"""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '[^\?]*/(index|view)', max_depth=5, num_retries=1)
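Before wiring up the full CSV callback in scrape_callback.py below, it can be handy to run the crawler with a trivial callback that only reports which pages were scraped. This is just an illustrative sketch of my own, not from the book; it assumes the code above has been saved as link_crawler.py and uses a small max_depth to keep the run short:

from link_crawler import link_crawler

def print_callback(url, html):
    # just report the page; return no extra links to enqueue
    print 'Scraped:', url, len(html), 'bytes'
    return []

if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '[^\?]*/(index|view)', max_depth=1, scrape_callback=print_callback)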
scrape_callback.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import re
import urlparse
import lxml.html
from link_crawler import link_crawler


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            self.writer.writerow(row)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com/', '[^\?]*/(index|view)', max_depth=5, scrape_callback=ScrapeCallback())


Reposted from: https://my.oschina.net/elleneye/blog/1615795