$ scrapy startproject amazon
2. Configure the item file
$ cd amazon/amazon
$ nano items.py
Edit items.py as follows:
---------------------
import scrapy


class AmazonItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field(serializer=str)
-----------------------
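A scrapy.Item behaves like a dict, so the fields above can be read and written with normal key syntax. A minimal sketch of this in a Python shell (assuming the items.py above; field values are usually lists because extract() returns every match):

from amazon.items import AmazonItem

item = AmazonItem(title=['Books'], link=['/books/'])
print(item['title'])   # ['Books']
print(dict(item))      # {'title': ['Books'], 'link': ['/books/']}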
3. Create the spider
$ nano spiders/amazon_spider.py
Add the following spider (the generated spiders/__init__.py can stay empty):
import scrapy

from amazon.items import AmazonItem


class AmazonAllDepartmentSpider(scrapy.Spider):
    name = "amazon"
    allowed_domains = ["amazon.com"]
    start_urls = [
        "http://www.amazon.com/gp/site-directory/ref=nav_sad/187-3757581-3331414"
    ]

    def parse(self, response):
        # each department appears as an <li> containing a link and a label
        for sel in response.xpath('//ul/li'):
            item = AmazonItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()
            item['desc'] = sel.xpath('text()').extract()
            yield item
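Before running the full crawl, the XPath expressions can be tried interactively with scrapy shell (results depend on the live Amazon page, which may change or block automated requests):

$ scrapy shell "http://www.amazon.com/gp/site-directory/ref=nav_sad/187-3757581-3331414"
>>> response.xpath('//ul/li/a/text()').extract()[:5]
>>> response.xpath('//ul/li/a/@href').extract()[:5]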
4. Crawl and save the data
$ scrapy crawl amazon -o items.csv -t csv
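To spot-check the export, a short (hypothetical) snippet like this prints the first few rows of items.csv:

import csv

with open('items.csv') as f:
    reader = csv.DictReader(f)
    for i, row in enumerate(reader):
        print(row['title'], row['link'])
        if i >= 4:   # show only the first five rows
            break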
5. Create the MySQL database
Log in to MySQL as root and run:
mysql> create database amazon_project;
mysql> grant all on amazon_project.* to 'qwerty'@'localhost' identified by 'qwerty123';
mysql> use amazon_project;
mysql> create table ProductDepartment (ProductDepartmentLink TEXT);
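The pipeline in step 6 imports the MySQLdb module; if it is not installed, the mysqlclient package (a maintained fork of MySQL-python) provides it:

$ pip install mysqlclient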
6. Configure pipelines.py
$ nano amazon/amazon/pipelines.py
import MySQLdb


class AmazonPipeline(object):
    host = 'localhost'
    user = 'qwerty'
    password = 'qwerty123'
    db = 'amazon_project'

    def __init__(self):
        self.connection = MySQLdb.connect(self.host, self.user, self.password, self.db)
        self.cursor = self.connection.cursor()

    def process_item(self, item, spider):
        try:
            # item['link'] is a list (extract() returns every match),
            # so join it into a single string for the TEXT column
            self.cursor.execute(
                """INSERT INTO amazon_project.ProductDepartment (ProductDepartmentLink)
                   VALUES (%s)""",
                (", ".join(item['link']),))
            self.connection.commit()
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
        return item
7. Add the pipeline to settings.py
$ nano amazon/amazon/settings.py
Add the following line:
ITEM_PIPELINES = {'amazon.pipelines.AmazonPipeline': 300}
8. Run the project
$ scrapy crawl amazon
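Once the crawl finishes, the stored links can be verified directly in MySQL:

mysql> SELECT COUNT(*) FROM amazon_project.ProductDepartment;
mysql> SELECT * FROM amazon_project.ProductDepartment LIMIT 5;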