Pixivのタイトル、タグ検索から一定数以上のブックマークがある画像だけを保存するスクリプト2

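ページ取得処理を pixiv.rb の AccessPixiv クラスとして明確に分離しました。main.rb は検索語でタグ検索を行い、ブックマーク数がしきい値以上の画像だけを保存します。

使い方: ruby main.rb config.yaml 検索語 ブックマーク数のしきい値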

  • 構成物
    • pixiv.rb
    • main.rb
    • config.yaml

pixiv.rb

#! ruby -Ku
require 'net/http'
require 'uri'
require 'kconv'

# Small client for the pixiv web site: logs in once with a form POST, keeps
# the session cookie, and then fetches search result pages and image files.
class AccessPixiv
  # Number of results the site is assumed to list per search page.
  IMAGES_PER_PAGE = 20

  # Totals parsed from the most recently fetched search page.
  attr_reader :total_images, :total_pages

  # Logs in and builds the request headers used by every later request.
  # +new+ cannot return a status, so the outcome is exposed via #logged_in?.
  #
  # pixiv_id, pixiv_pass : login credentials
  # user_agent           : value for the User-Agent header
  # referer              : value for the Referer header
  def initialize(pixiv_id, pixiv_pass, user_agent, referer)
    @total_images = 0
    @total_pages  = 0
    @logged_in    = get_cookie(pixiv_id, pixiv_pass, user_agent, referer)
  end

  # True when the login request succeeded and yielded a session cookie.
  def logged_in?
    @logged_in ? true : false
  end

  # Fetches the resource at +uri+ with the stored headers.
  # Returns the response body on success, false on an HTTP error.
  def get_file(uri)
    # Random pause of 2, 4, 6, 8 or 10 seconds between requests.
    sleep 2 * (rand(5) + 1)
    puts "get #{uri}"
    site = URI.parse(uri)
    # Honour the port and scheme of the URI instead of assuming port 80.
    Net::HTTP.start(site.host, site.port, use_ssl: site.scheme == 'https') do |http|
      response = http.get(site.request_uri, @header)
      disp_error(response) ? response.body : false
    end
  end

  # Fetches one search result page and updates #total_images / #total_pages.
  #
  # word     : search term
  # s_mode   : search mode ('s_tag' or 's_tc')
  # page_num : result page to fetch
  #
  # Returns the page body, or false when the page could not be fetched (the
  # totals are then left unchanged).
  def search(word, s_mode, page_num)
    # Escape the term fully so '&', '=', '+' or '#' in it cannot break the query.
    query = "word=#{URI.encode_www_form_component(word)}&s_mode=#{s_mode}&p=#{page_num}"
    page = get_file("http://www.pixiv.net/search.php?#{query}")
    return false unless page

    @total_images = count_results(page)
    # Round up; always report at least one page so the first one gets scanned.
    pages = (@total_images + IMAGES_PER_PAGE - 1) / IMAGES_PER_PAGE
    @total_pages = pages < 1 ? 1 : pages
    page
  end

  # Tag search.
  def search_tag(word, page_num = 1)
    search(word, 's_tag', page_num)
  end

  # Title / caption search.
  def search_title(word, page_num = 1)
    search(word, 's_tc', page_num)
  end

  # Writes +data+ to +filename+ byte for byte.
  def save_file(data, filename)
    puts "save #{filename}"
    # File.open + write: no trailing newline is appended to binary data and a
    # name starting with '|' is never treated as a command.
    File.open(filename, 'wb') { |file| file.write(data) }
  end

  # Creates ./name unless it already exists and returns its path.
  # On Windows platforms the name is converted with String#tosjis (kconv).
  def make_dir(name)
    windows = /mswin(?!ce)|mingw|cygwin|bccwin/ =~ RUBY_PLATFORM.downcase
    save_dir = "./#{windows ? name.tosjis : name}"
    if File.exist?(save_dir)
      puts 'Directory exist'
    else
      puts 'Create Directory'
      Dir.mkdir(save_dir)
    end
    save_dir
  end

  private

  # Posts the login form and stores the headers for later requests.
  # Returns true when the response is not an HTTP error and carries at least
  # one cookie, false otherwise.
  def get_cookie(pixiv_id, pixiv_pass, user_agent, referer)
    # URL-encode the credentials so characters such as '&' survive the POST.
    form = URI.encode_www_form([['mode', 'login'], ['pixiv_id', pixiv_id], ['pass', pixiv_pass]])
    Net::HTTP.start('www.pixiv.net', 80) do |http|
      response = http.post('/index.php', form, 'User-Agent' => user_agent)
      if disp_error(response)
        cookie = session_cookie(response)
        @header = { 'User-Agent' => user_agent, 'Referer' => referer }
        @header['Cookie'] = cookie unless cookie.empty?
        !cookie.empty?
      else
        false
      end
    end
  end

  # Builds a Cookie header value ("a=1; b=2") from every Set-Cookie header of
  # +response+, dropping attributes such as expires/path. Reading the fields
  # one by one avoids splitting on the commas inside Expires dates.
  def session_cookie(response)
    fields = response.get_fields('Set-Cookie') || []
    fields.map { |field| field[/\A[^;]*/].strip }.reject(&:empty?).join('; ')
  end

  # Extracts the hit count from a search result page; 0 when it is absent.
  def count_results(page)
    # Net::HTTP bodies are tagged as binary; retag a copy as UTF-8 so the
    # Japanese marker below can be matched against it.
    text = page.dup.force_encoding(Encoding::UTF_8)
    text = text.scrub if text.respond_to?(:scrub)
    text[/検索結果:(\d+)/, 1].to_i
  end

  # Prints "Error <status>" for any 4xx/5xx response.
  # Returns false for such a response and true for everything else
  # (redirects included).
  def disp_error(response)
    case response
    when Net::HTTPClientError, Net::HTTPServerError
      puts "Error #{response.code}"
      false
    else
      true
    end
  end
end

main.rb

#! ruby -Ku
require 'yaml'
# The current directory is no longer on the load path, so load pixiv.rb
# relative to this file.
require_relative 'pixiv'

# Downloads every image found by a pixiv tag search whose bookmark ("users")
# count is at least the given threshold.
#
# Usage: ruby main.rb CONFIG_FILE SEARCH_WORD THRESHOLD_USERS
abort "Usage: ruby #{File.basename($0)} CONFIG_FILE SEARCH_WORD THRESHOLD_USERS" if ARGV.size < 3

CONFIG_FILE     = ARGV[0]
SEARCH_WORD     = ARGV[1].toutf8
THRESHOLD_USERS = ARGV[2].to_i
START_PAGE      = 1

config  = YAML.load_file(CONFIG_FILE)
account = config['pixiv']
pixiv   = AccessPixiv.new(account['id'], account['pass'],
                          account['user_agent'], account['referer'])

save_dir = pixiv.make_dir(SEARCH_WORD)

# The first request supplies the totals; keep its body so page 1 is not
# downloaded a second time inside the loop.
first_page = pixiv.search_tag(SEARCH_WORD)
abort 'Search failed' unless first_page

# to_i guards against a nil total, which would otherwise build an endless range.
last_page = pixiv.total_pages.to_i

(START_PAGE..last_page).each do |page_num|
  puts "Total Images: #{pixiv.total_images}"
  puts "Total Pages : #{pixiv.total_pages}"
  page = page_num == 1 ? first_page : pixiv.search_tag(SEARCH_WORD, page_num)
  next unless page # skip pages that could not be fetched

  # Captures: thumbnail URL, thumbnail file name ("<id>_s.<ext>"), users count.
  page.scan(/"(http:\/\/.+\.pixiv\.net\/img\/.+\/(\d+_s\..{3}))".+?\s.+?\s.+?(\d+) users/) do |thumb_uri, thumb_name, users|
    next if users.to_i < THRESHOLD_USERS

    # Drop only the "_s" thumbnail suffix in front of the extension; a plain
    # sub(/_s/) would hit the first "_s" anywhere in the URL.
    uri  = thumb_uri.sub(/_s(\.[^.\/]+)\z/, '\1')
    name = thumb_name.sub(/_s\./, '.')
    data = pixiv.get_file(uri)
    pixiv.save_file(data, "#{save_dir}/#{users}_#{name}") if data
  end
end

puts 'Done!'

config.yaml

# Settings loaded by main.rb (given as its first argument) and passed to AccessPixiv.new.
pixiv:
  # Login credentials, posted as the pixiv_id / pass form fields when the session cookie is requested.
  id  : 'UserID'
  pass: 'PassWord'
  # Sent as the User-Agent header on the login request and on every later page/image request.
  user_agent: 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
  # Sent as the Referer header on every page/image request made after login.
  referer: 'http://www.pixiv.net/mypage.php'