This example demonstrates how to dynamically insert data into MySQL using Python and Scrapy. You do not need to edit the pipelines.py file for each project; the same pipeline can be reused across all of your projects. Just yield your_data_dictionary from your Spider, and inside pipelines.py a query will be built automatically. Rows are inserted in bulk with a single executemany() call.
MUST READ: The keys of the your_data_dictionary that you yield from your Spider must exactly match the column names of your database table (the spider sketch below shows this). The batch size is set by the if len(self.items) >= 50: line in pipelines.py; you can change 50 to any integer.

settings.py
DB_CREDS = {
    'host': 'localhost',
    'user': 'root',
    'pass': 'password',
    'db': 'db_name'
}
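Still in settings.py, the pipeline also has to be enabled, or Scrapy will never call process_item. The original snippet does not show this step; the module path below is an assumption based on the file layout, so replace it with your actual project package:

ITEM_PIPELINES = {
    'your_project_folder.pipelines.MyPipeline': 300,  # assumed path -- adjust to your project
}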
your_project_folder/spiders/spider_file.py
import MySQLdb
import MySQLdb.cursors
from scrapy.utils.project import get_project_settings

DB_CREDS = get_project_settings().get('DB_CREDS')  # credentials from settings.py

# Inside your Spider class (needs the MySQLdb bindings, e.g. the mysqlclient package):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.connectDB()

    def connectDB(self):
        self.conn = MySQLdb.connect(user=DB_CREDS['user'], passwd=DB_CREDS['pass'], db=DB_CREDS['db'], host=DB_CREDS['host'], charset="utf8", use_unicode=True)
        self.cursor = MySQLdb.cursors.DictCursor(self.conn)
        self.conn.autocommit(True)
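To make the column-matching rule concrete, here is a minimal end-to-end spider sketch. The start URL, the CSS selectors, and the name/price columns are invented for the illustration; the only real requirement is that the yielded keys mirror the columns of your table.

import scrapy
import MySQLdb
import MySQLdb.cursors
from scrapy.utils.project import get_project_settings

DB_CREDS = get_project_settings().get('DB_CREDS')

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com/products']  # assumed URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.connectDB()

    def connectDB(self):
        self.conn = MySQLdb.connect(user=DB_CREDS['user'], passwd=DB_CREDS['pass'], db=DB_CREDS['db'], host=DB_CREDS['host'], charset="utf8", use_unicode=True)
        self.cursor = MySQLdb.cursors.DictCursor(self.conn)
        self.conn.autocommit(True)

    def parse(self, response):
        # Assumed table: CREATE TABLE table_name (name VARCHAR(255), price VARCHAR(32))
        for product in response.css('div.product'):  # assumed page structure
            yield {
                'name': product.css('h2::text').get(),
                'price': product.css('span.price::text').get(),
            }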
your_project_folder/pipelines.py
# -*- coding: utf-8 -*-
import logging

class MyPipeline(object):

    def __init__(self):
        self.items = []

    def process_item(self, item, spider):
        # Build the INSERT statement from the item itself, so the same
        # pipeline works for any table whose columns match the item's keys.
        self.placeholders = ', '.join(['%s'] * len(item))
        self.columns = ', '.join(item.keys())
        self.query = "INSERT INTO %s ( %s ) VALUES ( %s )" % ("table_name", self.columns, self.placeholders)
        self.items.append(tuple(item.values()))
        if len(self.items) >= 50:  # flush in batches of 50 rows
            try:
                spider.cursor.executemany(self.query, self.items)
                self.items = []
            except Exception as e:
                if 'MySQL server has gone away' in str(e):
                    # the connection timed out; reconnect and retry once
                    logging.warning('MySQL connection lost, reconnecting')
                    spider.connectDB()
                    spider.cursor.executemany(self.query, self.items)
                    self.items = []
                else:
                    raise
        return item

    def close_spider(self, spider):
        # Flush the final, partial batch; skip if nothing was scraped
        # (self.query is only set once an item has been processed).
        if not self.items:
            return
        try:
            spider.cursor.executemany(self.query, self.items)
            self.items = []
        except Exception as e:
            if 'MySQL server has gone away' in str(e):
                spider.connectDB()
                spider.cursor.executemany(self.query, self.items)
                self.items = []
            else:
                raise
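To see what this pipeline actually sends to MySQL, here is a small standalone illustration of the query that process_item builds; the two-field item is made up for the example:

item = {'name': 'Widget', 'price': '9.99'}  # assumed item yielded by the spider

placeholders = ', '.join(['%s'] * len(item))  # '%s, %s'
columns = ', '.join(item.keys())              # 'name, price'
print("INSERT INTO %s ( %s ) VALUES ( %s )" % ("table_name", columns, placeholders))
# INSERT INTO table_name ( name, price ) VALUES ( %s, %s )

executemany() then substitutes each buffered row into the %s placeholders, so the values are escaped by the MySQL driver rather than formatted into the SQL string.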