Untitled diff
150 lines · 10 removals · 10 additions
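Every change in this diff removes a stray self. prefix from an imported name (urllib2, os, BeautifulSoup, urljoin). Imports live at module scope, not as attributes of the Spider instance, so the prefixed form raises AttributeError as soon as the method runs. A minimal sketch of the distinction, using only the standard library and a hypothetical Example class:

import os

class Example:
    def module_scope(self):
        return os.getcwd()       # resolves the module-level name 'os'

    def instance_scope(self):
        return self.os.getcwd()  # AttributeError: the instance has no attribute 'os'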
#note: <meta content='Story' property='bb:resource_type'>

import urllib2
import os
from bs4 import BeautifulSoup
from urlparse import urljoin

class Spider:

    links_to_crawl = []
    crawled_links = []
    ignored_links = ['/']
    domain = 'http://bloomberg.com/'
    #meta type = ('meta', {'property','bb:resource_type'})['content']=='Story'

    # append the starting link to links_to_crawl
    def __init__(self, url):
        print 'Spider initialising...'
        self.links_to_crawl.append(url)

    # open input url and return html
    def grab_html(self,url):
-        open_url = self.urllib2.urlopen(url)
+        open_url = urllib2.urlopen(url)
        data = open_url.read()
        open_url.close()
        return data
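grab_html only closes the handle when read() succeeds; if read() raises, the connection stays open. A hedged variant, sketched as a standalone function rather than the class method, that closes the handle unconditionally:

import contextlib
import urllib2

def grab_html(url):
    # contextlib.closing() calls close() even if read() raises
    with contextlib.closing(urllib2.urlopen(url)) as open_url:
        return open_url.read()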
    # return title from input html for file naming and ensure
    # no '/' present in title.
    def get_title(self, data=''):
        title_start = data.find('<title>')+7
        title_end = data.find('</title>')-1
        title = data[title_start:title_end]
        title = title.translate(None, '/')
        return title+".txt"
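get_title slices between the literal strings '<title>' and '</title>', so a missing tag makes str.find return -1 and the slice silently yields the wrong text, and the -1 on title_end trims the last character of the title. Since BeautifulSoup is already imported, a hedged alternative (standalone function; the 'Untitled' fallback is an assumption):

from bs4 import BeautifulSoup

def get_title(data=''):
    soup = BeautifulSoup(data)
    # soup.title is None when the page has no <title> tag
    title = soup.title.string if soup.title and soup.title.string else 'Untitled'
    return title.replace('/', '') + '.txt'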
    # return date from input html for file saving structure
    def get_date(self, data=''):
-        soup = self.BeautifulSoup(data)
+        soup = BeautifulSoup(data)
        # try/except to avoid an error when the meta tag combination
        # is not found.
        try:
            date = soup.find('meta', {'name':'pubdate'})['content']
            return date[:12] # !! only tested with bloomberg.com !!
        # if there is no published date, return 'Other'
        except TypeError:
            return 'Other'
    # if link is relative url return 'Rel' or
    # if url is allowed domain return 'Abs', else False.
    def url_type(self,url=''):
        if url[0:4] != 'http':
            return 'Rel'
        elif url.find(self.domain) != -1:
            return 'Abs'
        else:
            return False
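url_type treats anything that does not start with 'http' as relative, so mailto: links and protocol-relative //host/path links come back as 'Rel', and the substring check against self.domain never matches http://www.bloomberg.com/... because the stored domain has no www. A sketch of a stricter classifier using urlparse from the same urlparse module (treating any host ending in bloomberg.com as in-domain is an assumption):

from urlparse import urlparse

def url_type(url=''):
    parsed = urlparse(url)
    if parsed.scheme not in ('http', 'https', ''):
        return False                                # mailto:, javascript:, etc.
    elif not parsed.netloc:                         # no host part -> relative link
        return 'Rel'
    elif parsed.netloc.endswith('bloomberg.com'):   # assumed in-domain rule
        return 'Abs'
    else:
        return False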
    # reconstruct relative url
    def reconstruct_url(self, page='', rel=''):
        print page #debug
        print rel #debug
-        print self.urljoin(page, rel) #debug
+        print urljoin(page, rel) #debug
-        return self.urljoin(page, rel)
+        return urljoin(page, rel)
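reconstruct_url is a thin wrapper around urljoin plus debug prints; note that the call site in get_links passes self.domain as the base (it carries the author's '#to change' note), so relative links always resolve against the site root rather than against the page they appeared on. For reference, urljoin resolves a relative path against a base URL:

from urlparse import urljoin

print urljoin('http://bloomberg.com/', '/news/articles/example')   # http://bloomberg.com/news/articles/example
print urljoin('http://bloomberg.com/news/', 'articles/example')    # http://bloomberg.com/news/articles/example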
    # get all links in input html and append to links_to_crawl
    # unless in crawled_links or ignored_links
    # if link is relative url reconstruct url and append to
    # links_to_crawl, append relative url to ignored_links
    def get_links(self, data=''):
-        soup = self.BeautifulSoup(data)
+        soup = BeautifulSoup(data)
        for link in soup.find_all('a'):
            # try statement to avoid error when finding
            # <a> tags without 'href'
            try:
                # skip links already crawled or ignored
                if link['href'] in self.ignored_links or link['href'] in self.crawled_links:
                    pass
                else:
                    if self.url_type(link['href'])=='Rel':
                        reconstructed_link = self.reconstruct_url(self.domain, link['href']) #to change !!!!!!!!!!!!!!!!!
                        self.links_to_crawl.append(reconstructed_link) # append reconstructed link to links_to_crawl
                        self.ignored_links.append(link['href']) # append original link to ignored_links
                    else:
                        self.links_to_crawl.append(link['href'])
            except KeyError:
                pass
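A Python pitfall worth spelling out around the duplicate check: a condition written as x in a or b parses as (x in a) or b, so a non-empty list on the right makes the whole expression truthy regardless of x; each container needs its own membership test, as above. For larger crawls, sets also make those membership tests O(1) instead of O(n). A short illustration (the set-based names are assumptions, not part of the diff):

print 'x' in [] or ['/']           # ['/'] -- truthy: parses as ('x' in []) or ['/']
print 'x' in [] or 'x' in ['/']    # False -- two explicit membership tests

crawled_links = set()
ignored_links = set(['/'])

def seen(href):
    # set membership is O(1) on average; list membership is O(n)
    return href in crawled_links or href in ignored_links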
    # if directory exists do nothing
    # if directory does not exist write directory
    def ensure_dir(self, directory=''):
-        if self.os.path.exists(directory):
+        if os.path.exists(directory):
            pass
        else:
-            self.os.makedirs(directory)
+            os.makedirs(directory)
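ensure_dir checks for the directory and then creates it, which can race if something else creates the directory between the two calls; the usual Python 2 idiom is to attempt makedirs and ignore only the 'already exists' error. A sketch of that idiom as a standalone function:

import errno
import os

def ensure_dir(directory=''):
    try:
        os.makedirs(directory)
    except OSError as exc:
        # swallow 'already exists' only; re-raise anything else
        if exc.errno != errno.EEXIST:
            raise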
    # ensure the html being saved is the type requested
    # currently only compatible with 1 meta type
    def ensure_meta_type(self, data=''):
-        soup = self.BeautifulSoup(data)
+        soup = BeautifulSoup(data)
        try:
            # only treat the page as a story when the meta content matches
            if soup.find('meta', {'property':'bb:resource_type'})['content']=='Story':
                print 'True'
                return True
            else:
                print 'False'
                return False
        except TypeError:
            print 'False'
            return False
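The try/except in ensure_meta_type works because BeautifulSoup's find returns None when no matching tag exists, and subscripting None raises TypeError. A quick illustration:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<html><head><title>No meta tag here</title></head></html>")
tag = soup.find('meta', {'property': 'bb:resource_type'})
print tag   # None -- nothing matched
# tag['content'] would raise TypeError here, which ensure_meta_type catches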
    # save input html to txt file on mac os desktop and return
    # absolute path to file
    def save_html(self,data=''):
        if self.ensure_meta_type(data):
            print 'SAVING URL'
            # allocate save path for file and ensure save path exists
-            save_path = self.os.path.abspath('/Users/sampeka/Desktop/Python Spider'+'/'+self.get_date(data))
+            save_path = os.path.abspath('/Users/sampeka/Desktop/Python Spider'+'/'+self.get_date(data))
            self.ensure_dir(save_path)
            # get file name and write file to absolute path
            file_name = self.get_title(data)
            absolute_path = save_path+'/'+file_name
            opened_file = open(absolute_path,'w')
            opened_file.write(data)
            opened_file.close()
            return absolute_path
        else:
            pass
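save_html builds the path by string concatenation and closes the file by hand; os.path.join and a with block do the same job with less room for error. A hedged sketch of just the write step, as a standalone function (write_html is a made-up name):

import os

def write_html(save_path, file_name, data=''):
    absolute_path = os.path.join(save_path, file_name)
    # the with block closes the file even if write() raises
    with open(absolute_path, 'w') as opened_file:
        opened_file.write(data)
    return absolute_path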
    # crawl links_to_crawl, popping each crawled url to the
    # crawled_links list; on ValueError or urllib2.URLError
    # (e.g. non-url links) pop the url to ignored_links instead
    def crawl_links(self):
        while len(self.links_to_crawl) > 0:
            url = self.links_to_crawl[0]
            print url
            try:
                data = self.grab_html(url)
                self.get_links(data)
                self.save_html(data)
                self.crawled_links.append(self.links_to_crawl.pop(0))
-            except (ValueError, self.urllib2.URLError):
+            except (ValueError, urllib2.URLError):
                self.ignored_links.append(self.links_to_crawl.pop(0))
        print 'Spider finished.'
        print 'Ignored links:'
        print self.ignored_links
        print 'Crawled links:'
        print self.crawled_links
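crawl_links pops from the front of a plain list, which is O(n) per pop; collections.deque offers an O(1) popleft for the same first-in-first-out behaviour. A sketch of the queue handling only (assumed names, not part of the diff):

from collections import deque

links_to_crawl = deque(['http://www.bloomberg.com/news'])
crawled_links = []

while links_to_crawl:
    url = links_to_crawl.popleft()   # O(1), unlike list.pop(0)
    # ... fetch, parse and save the page here ...
    crawled_links.append(url)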

spider = Spider('http://www.bloomberg.com/news')
spider.crawl_links()