たれみみマンデー

RubyでSEIYUの商品情報をほぼ全件取得、CSVに出力。

RubyでSEIYUの商品情報をほぼ全件取得、CSVに出力するプログラムを書きました。

商品件数は1万4000件ほどあります。

カラムは商品名、値段(¥)、商品画像URL、カテゴリ3つです。

主に使ったGemはAnemoneとNokogiriです。

自分用ビボウロクなのでインデントズレズレ、無駄なコメントアウト多くてすいません。無視して下さい、、、、、

 

require 'rubygems'
#require 'string-scrub'
require 'nokogiri'
require 'kconv'
require "open-uri"
require "anemone"
require 'csv'


def category_url_get
  # Crawls the SEIYU top page and collects, for every level-3 category link,
  # its absolute URL into $category_urls and its name triple
  # [level1, level2, level3] into $category. Both globals are consumed by
  # #scrape, whose indices stay pairwise aligned via Array#zip.
  $category = []
  $category_urls = []
  base = "https://www.the-seiyu.com/front/contents/top/ns/"
  Anemone.crawl(base, :depth_limit => 0, :delay => 3) do |anemone|
    anemone.on_every_page do |page|
      doc = Nokogiri::HTML.parse(page.body.force_encoding("UTF-8"))
      doc.xpath("//*[@id='categoryListWrapper_0002']/div[2]").each do |node|
        node.css("a.level_3_li_Inner").each do |node_3|
          # BUG FIX: the original used gsub!, which returns nil when no
          # whitespace was removed — gsub always returns the string.
          category_3 = node_3.inner_text.gsub(/\s/, "")
          category_url_3 = "https://www.the-seiyu.com" + node_3["href"]
          # Level-2 / level-1 names are read from the enclosing list markup.
          category_2 = node_3.parent.parent.parent.css("a.level_2_li_Inner").inner_text.gsub(/\s/, "")
          category_1 = node_3.parent.parent.parent.parent.parent.css("div.level_1_li_Inner").xpath("./a/span").inner_text
          $category_urls << category_url_3
          $category << [category_1, category_2, category_3]
        end
      end
    end
  end
end
# BUG FIX: this call was commented out in the original, so $category_urls
# stayed nil and #scrape crashed on $category_urls.flatten!.
category_url_get

#$other_urls = []
#def other_category_url
#  $other_urls = $category_urls.map {|url|  url + "&mode=image&pageSize=49&currentPage=#{p+1}&alignmentSequence=1&resultMessage="}
#end

def scrape
  # Visits every category URL collected by #category_url_get and scrapes each
  # product's name, price and image URL, appending one
  # [cat1, cat2, cat3, 商品名, 値段, 画像URL] row per product to $result.
  $urls = []
  $stuff = []
  $result = []

  $category_urls.flatten!
  $category.flatten!(1)   # one level only: keep each [cat1, cat2, cat3] triple intact

  $category_urls.zip($category).each do |category_url, category|
    p category_url
    # BUG FIX: the original assigned a *String* copy of category_url to $urls
    # and then appended page URLs onto that string with <<, so the extra pages
    # were never crawled. We instead keep a queue of page URLs and crawl it in
    # passes until no new pages are discovered.
    queue = [category_url]
    first_page = true
    until queue.empty?
      batch = queue
      queue = []
      Anemone.crawl(batch, :depth_limit => 0, :delay => 3) do |anemone|
        anemone.on_every_page do |page|
          doc = Nokogiri::HTML.parse(page.body.force_encoding("UTF-8"))
          # NOTE(review): in XPath the `and 'resized'` predicate is always true
          # (a non-empty string literal), so this matches
          # class="jsFlatHeight_list" exactly — kept as the original behaved.
          doc.xpath("//li[@class='jsFlatHeight_list' and 'resized']").each do |node|
            row = Marshal.load(Marshal.dump(category))               # deep copy of the name triple
            row << node.xpath(".//img").attribute("title").value          # 商品名
            row << node.xpath(".//div/div/span/strong").inner_text        # 値段
            row << node.xpath(".//img").attribute("data-original").value  # 画像URL
            $result << row
          end
          # Pagination: the last pager link looks like "javascript:move('N',...)"
          # where N is the final page number. Read it from page 1 only.
          if first_page
            first_page = false
            begin
              last = doc.xpath("//*[@id='list']/div/ul[2]/li[last()]/a")
                        .attribute("href").value[/javascript:move\('(\d+)',/, 1].to_i
              # BUG FIX: the original wrote `n =- 1` (assigning -1), which made
              # the page range (1..-1) empty and skipped every page but the first.
              (1..last - 1).each do |pg|
                queue << category_url + "&mode=image&pageSize=49&currentPage=#{pg+1}&alignmentSequence=1&resultMessage="
              end
            rescue
              # Single-page category: the pager element is absent.
            end
          end
        end
      end
    end
  end
end
scrape

def to_csv
  header = ["カテゴリ1","カテゴリ2","カテゴリ3","商品名","値段(¥)","画像URL"]  
  CSV.open('seiyu.csv','w',:encoding => "Windows-31J",:headers => true) do |file|  
    file << header
    $result.each do |line|
      file << line
    end
  end
 puts "--------------------------------------------------------------------------"
end
to_csv

 

サーバー負荷を考えて必ずdelayを長くとって下さい。

自己責任でお願いします。なにか不都合があれば即削除します。

 

質問も受け付けていますので気軽にメンションとばして下さい。

ではでは〜。

 

たれみみ

@taremimi_7