Scrapy: Connecting and bulk-inserting to MySQL from Scrapy using the MySQLdb module — Python 2.7


Example

This example demonstrates how to dynamically insert data into MySQL using Python Scrapy.

You do not need to edit pipelines.py file for any project.

This example can be used for all of your projects.

Just yield you_data_dictionary from your Spider, and a query will be created automatically inside pipelines.py.

Rows are inserted in bulk using a single bulk INSERT statement.

MUST READ:

  1. Keys of the you_data_dictionary that you are yielding from your Spider must be the same as the column names of your database table.
  2. Table must be created before you run your code.
  3. Notice the `if len(self.items) >= 50` line; you can change 50 to any integer.

settings.py

# MySQL connection credentials, read by the spider's connectDB() method.
DB_CREDS = {
    'host': 'localhost',  # MySQL server hostname
    'user': 'root',       # MySQL user name
    'pass': 'password',   # MySQL password
    'db': 'db_name',      # database (schema) to use
}

your_project_folder/spiders/spider_file.py

from scrapy.utils.project import get_project_settings
        def __init__(self, *args, **kwargs):
                # Open the MySQL connection as soon as the spider is created,
                # so self.conn / self.cursor are available to the pipeline.
                # NOTE(review): *args/**kwargs are accepted but never forwarded
                # to the base Spider's __init__ -- confirm this is intentional.
                self.connectDB()

        def connectDB(self):
                """Open a MySQL connection using DB_CREDS from settings.py.

                Sets self.conn (the connection) and self.cursor (a DictCursor).
                The pipeline also calls this to reconnect after the server
                drops an idle connection.
                """
                # charset="utf8" + use_unicode=True so text columns round-trip
                # as Unicode strings rather than raw bytes.
                self.conn = MySQLdb.connect(user=DB_CREDS['user'], passwd=DB_CREDS['pass'], db=DB_CREDS['db'], host=DB_CREDS['host'], charset="utf8", use_unicode=True)

                # DictCursor returns each row as a column-name -> value dict.
                self.cursor = MySQLdb.cursors.DictCursor(self.conn) 

                # Commit after every statement automatically, so the pipeline's
                # bulk inserts persist without explicit commit() calls.
                self.conn.autocommit(True)

your_project_folder/pipelines.py

# -*- coding: utf-8 -*-
import logging
from scrapy import signals

class MyPipeline(object):
    """Buffer scraped items and bulk-insert them into MySQL.

    The INSERT statement is built dynamically from each item's keys, so the
    item's keys must match the table's column names.  Rows are buffered and
    flushed in batches of BATCH_SIZE via ``executemany``; any remainder is
    flushed when the spider closes.  The spider object is expected to expose
    ``cursor`` (a DB-API cursor) and ``connectDB()`` (reconnect helper).
    """

    # Flush the buffer to MySQL once this many rows have accumulated.
    # (Was a hard-coded 50; override on a subclass or instance to tune.)
    BATCH_SIZE = 50

    def __init__(self):
        # Buffered rows (tuples of values) waiting to be inserted.
        self.items = []
        # Last INSERT statement built by process_item; reused by close_spider.
        self.query = None

    def process_item(self, item, spider):
        """Buffer *item*'s values; flush to MySQL when the batch is full.

        Returns the item unchanged so later pipelines still receive it.
        """
        # Rebuild the INSERT from this item's keys.
        # NOTE(review): assumes every item has the same keys in the same
        # order as the buffered rows -- confirm against the spider's output.
        placeholders = ', '.join(['%s'] * len(item))
        columns = ', '.join(item.keys())
        self.query = "INSERT INTO %s ( %s ) VALUES ( %s )" % (
            "table_name", columns, placeholders)

        # Snapshot the values now; a live dict view could change later.
        self.items.append(tuple(item.values()))

        if len(self.items) >= self.BATCH_SIZE:
            self._flush(spider)
        return item

    def close_spider(self, spider):
        """Flush any rows still buffered when the spider shuts down."""
        # self.query is None when no item was ever processed -- nothing to do.
        if self.items and self.query:
            self._flush(spider)

    def _flush(self, spider):
        """Bulk-insert the buffered rows, reconnecting once if needed."""
        try:
            spider.cursor.executemany(self.query, self.items)
        except Exception as e:
            # MySQL drops idle connections ("server has gone away");
            # reconnect once and retry, otherwise propagate the error.
            if 'MySQL server has gone away' in str(e):
                spider.connectDB()
                spider.cursor.executemany(self.query, self.items)
            else:
                raise  # bare raise preserves the original traceback
        # Only clear the buffer after a successful insert.
        self.items = []