Untitled diff
150 lines · 10 removals · 10 additions
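Every change in this diff removes a stray self. prefix from an imported name (urllib2, os, BeautifulSoup, urljoin). Imports live at module scope, not as attributes of the Spider instance, so the prefixed form raises AttributeError as soon as the method runs. A minimal sketch of the distinction, using only the standard library and a hypothetical Example class:

import os

class Example:
    def module_scope(self):
        return os.getcwd()       # resolves the module-level name 'os'

    def instance_scope(self):
        return self.os.getcwd()  # AttributeError: the instance has no attribute 'os'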
#note: <meta content='Story' property='bb:resource_type'>

import urllib2
import os
from bs4 import BeautifulSoup
from urlparse import urljoin

class Spider:

    links_to_crawl = []
    crawled_links = []
    ignored_links = ['/']
    domain = 'http://bloomberg.com/'
    #meta type = ('meta', {'property','bb:resource_type'})['content']=='Story'

    # append the starting link to links_to_crawl
    def __init__(self, url):
        print 'Spider initialising...'
        self.links_to_crawl.append(url)

    # open input url and return html
    def grab_html(self,url):
-        open_url = self.urllib2.urlopen(url)
+        open_url = urllib2.urlopen(url)
        data = open_url.read()
        open_url.close()
        return data
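grab_html only closes the handle when read() succeeds; if read() raises, the connection stays open. A hedged variant, sketched as a standalone function rather than the class method, that closes the handle unconditionally:

import contextlib
import urllib2

def grab_html(url):
    # contextlib.closing() calls close() even if read() raises
    with contextlib.closing(urllib2.urlopen(url)) as open_url:
        return open_url.read()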
    # return title from input html for file naming and ensure
    # no '/' present in title.
    def get_title(self, data=''):
        title_start = data.find('<title>')+7
        title_end = data.find('</title>')-1
        title = data[title_start:title_end]
        title = title.translate(None, '/')
        return title+".txt"
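get_title slices between the literal strings '<title>' and '</title>', so a missing tag makes str.find return -1 and the slice silently yields the wrong text, and the -1 on title_end trims the last character of the title. Since BeautifulSoup is already imported, a hedged alternative (standalone function; the 'Untitled' fallback is an assumption):

from bs4 import BeautifulSoup

def get_title(data=''):
    soup = BeautifulSoup(data)
    # soup.title is None when the page has no <title> tag
    title = soup.title.string if soup.title and soup.title.string else 'Untitled'
    return title.replace('/', '') + '.txt'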
    # return date from input html for file saving structure
    def get_date(self, data=''):
-        soup = self.BeautifulSoup(data)
+        soup = BeautifulSoup(data)
        # try/except to avoid an error when the meta tag combination
        # is not found.
        try:
            date = soup.find('meta', {'name':'pubdate'})['content']
            return date[:12] # !! only tested with bloomberg.com !!
        # if there is no published date, return 'Other'
        except TypeError:
            return 'Other'
    # if link is relative url return 'Rel' or
    # if url is allowed domain return 'Abs', else False.
    def url_type(self,url=''):
        if url[0:4] != 'http':
            return 'Rel'
        elif url.find(self.domain) != -1:
            return 'Abs'
        else:
            return False
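url_type treats anything that does not start with 'http' as relative, so mailto: links and protocol-relative //host/path links come back as 'Rel', and the substring check against self.domain never matches http://www.bloomberg.com/... because the stored domain has no www. A sketch of a stricter classifier using urlparse from the same urlparse module (treating any host ending in bloomberg.com as in-domain is an assumption):

from urlparse import urlparse

def url_type(url=''):
    parsed = urlparse(url)
    if parsed.scheme not in ('http', 'https', ''):
        return False                                # mailto:, javascript:, etc.
    elif not parsed.netloc:                         # no host part -> relative link
        return 'Rel'
    elif parsed.netloc.endswith('bloomberg.com'):   # assumed in-domain rule
        return 'Abs'
    else:
        return False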
    # reconstruct relative url
    def reconstruct_url(self, page='', rel=''):
        print page #debug
        print rel #debug
-        print self.urljoin(page, rel) #debug
+        print urljoin(page, rel) #debug
-        return self.urljoin(page, rel)
+        return urljoin(page, rel)
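reconstruct_url is a thin wrapper around urljoin plus debug prints; note that the call site in get_links passes self.domain as the base (it carries the author's '#to change' note), so relative links always resolve against the site root rather than against the page they appeared on. For reference, urljoin resolves a relative path against a base URL:

from urlparse import urljoin

print urljoin('http://bloomberg.com/', '/news/articles/example')   # http://bloomberg.com/news/articles/example
print urljoin('http://bloomberg.com/news/', 'articles/example')    # http://bloomberg.com/news/articles/example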
    # get all links in input html and append to links_to_crawl
    # unless in crawled_links or ignored_links
    # if link is relative url reconstruct url and append to
    # links_to_crawl, append relative url to ignored_links
    def get_links(self, data=''):
-        soup = self.BeautifulSoup(data)
+        soup = BeautifulSoup(data)
        for link in soup.find_all('a'):
            # try statement to avoid error when finding
            # <a> tags without 'href'
            try:
                # skip links already crawled or ignored
                if link['href'] in self.ignored_links or link['href'] in self.crawled_links:
                    pass
                else:
                    if self.url_type(link['href'])=='Rel':
                        reconstructed_link = self.reconstruct_url(self.domain, link['href']) #to change !!!!!!!!!!!!!!!!!
                        self.links_to_crawl.append(reconstructed_link) # append reconstructed link to links_to_crawl
                        self.ignored_links.append(link['href']) # append original link to ignored_links
                    else:
                        self.links_to_crawl.append(link['href'])
            except KeyError:
                pass
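A Python pitfall worth spelling out around the duplicate check: a condition written as x in a or b parses as (x in a) or b, so a non-empty list on the right makes the whole expression truthy regardless of x; each container needs its own membership test, as above. For larger crawls, sets also make those membership tests O(1) instead of O(n). A short illustration (the set-based names are assumptions, not part of the diff):

print 'x' in [] or ['/']           # ['/'] -- truthy: parses as ('x' in []) or ['/']
print 'x' in [] or 'x' in ['/']    # False -- two explicit membership tests

crawled_links = set()
ignored_links = set(['/'])

def seen(href):
    # set membership is O(1) on average; list membership is O(n)
    return href in crawled_links or href in ignored_links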
    # if directory exists do nothing
    # if directory does not exist write directory
    def ensure_dir(self, directory=''):
-        if self.os.path.exists(directory):
+        if os.path.exists(directory):
            pass
        else:
-            self.os.makedirs(directory)
+            os.makedirs(directory)
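ensure_dir checks for the directory and then creates it, which can race if something else creates the directory between the two calls; the usual Python 2 idiom is to attempt makedirs and ignore only the 'already exists' error. A sketch of that idiom as a standalone function:

import errno
import os

def ensure_dir(directory=''):
    try:
        os.makedirs(directory)
    except OSError as exc:
        # swallow 'already exists' only; re-raise anything else
        if exc.errno != errno.EEXIST:
            raise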
    # ensure the html being saved is the type requested
    # currently only compatible with 1 meta type
    def ensure_meta_type(self, data=''):
-        soup = self.BeautifulSoup(data)
+        soup = BeautifulSoup(data)
        try:
            # only treat the page as a story when the meta content matches
            if soup.find('meta', {'property':'bb:resource_type'})['content']=='Story':
                print 'True'
                return True
            else:
                print 'False'
                return False
        except TypeError:
            print 'False'
            return False
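The try/except in ensure_meta_type works because BeautifulSoup's find returns None when no matching tag exists, and subscripting None raises TypeError. A quick illustration:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<html><head><title>No meta tag here</title></head></html>")
tag = soup.find('meta', {'property': 'bb:resource_type'})
print tag   # None -- nothing matched
# tag['content'] would raise TypeError here, which ensure_meta_type catches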
    # save input html to txt file on mac os desktop and return
    # absolute path to file
    def save_html(self,data=''):
        if self.ensure_meta_type(data):
            print 'SAVING URL'
            # allocate save path for file and ensure save path exists
-            save_path = self.os.path.abspath('/Users/sampeka/Desktop/Python Spider'+'/'+self.get_date(data))
+            save_path = os.path.abspath('/Users/sampeka/Desktop/Python Spider'+'/'+self.get_date(data))
            self.ensure_dir(save_path)
            # get file name and write file to absolute path
            file_name = self.get_title(data)
            absolute_path = save_path+'/'+file_name
            opened_file = open(absolute_path,'w')
            opened_file.write(data)
            opened_file.close()
            return absolute_path
        else:
            pass
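save_html builds the path by string concatenation and closes the file by hand; os.path.join and a with block do the same job with less room for error. A hedged sketch of just the write step, as a standalone function (write_html is a made-up name):

import os

def write_html(save_path, file_name, data=''):
    absolute_path = os.path.join(save_path, file_name)
    # the with block closes the file even if write() raises
    with open(absolute_path, 'w') as opened_file:
        opened_file.write(data)
    return absolute_path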
    # crawl links_to_crawl, popping each crawled url to the
    # crawled_links list; on ValueError or urllib2.URLError
    # (e.g. non-url links) pop the url to ignored_links instead
    def crawl_links(self):
        while len(self.links_to_crawl) > 0:
            url = self.links_to_crawl[0]
            print url
            try:
                data = self.grab_html(url)
                self.get_links(data)
                self.save_html(data)
                self.crawled_links.append(self.links_to_crawl.pop(0))
-            except (ValueError, self.urllib2.URLError):
+            except (ValueError, urllib2.URLError):
                self.ignored_links.append(self.links_to_crawl.pop(0))
        print 'Spider finished.'
        print 'Ignored links:'
        print self.ignored_links
        print 'Crawled links:'
        print self.crawled_links
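crawl_links pops from the front of a plain list, which is O(n) per pop; collections.deque offers an O(1) popleft for the same first-in-first-out behaviour. A sketch of the queue handling only (assumed names, not part of the diff):

from collections import deque

links_to_crawl = deque(['http://www.bloomberg.com/news'])
crawled_links = []

while links_to_crawl:
    url = links_to_crawl.popleft()   # O(1), unlike list.pop(0)
    # ... fetch, parse and save the page here ...
    crawled_links.append(url)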

spider = Spider('http://www.bloomberg.com/news')
spider.crawl_links()