I have been writing Ruby scripts for a while now. The documentation for some Ruby library APIs is not very complete, especially for socket, so in this post I excerpt the usage of a few of Ruby's standard libraries. The logic of the code itself runs, but the accompanying YAML data files are not included, so the scripts cannot be run as-is.
I wrote this post hoping it will help some Ruby newcomers. There are not many Chinese Ruby tutorials at the moment, and quite a few repeat each other. In my view Ruby is neither especially good nor especially bad; once you get used to its syntax and its "pass by object reference" semantics, you will inevitably come to both love and hate it. This post focuses on practical things that help you finish tasks quickly.
Part One: "Hepburn", an m3u8 downloader
It uses the following parts of Ruby:
open-uri: an HTTP download library. Remember to set headers: `io = URI.open(link, hash)`, where the hash roughly corresponds to the headers of the HTTP GET/POST request. If you specify `:encoding => 'utf-8'` in the hash, open-uri reads the downloaded stream as UTF-8; otherwise it defaults to reading it as ASCII.
socket (run here on Windows): reading and writing with TCPServer and TCPSocket; note the usage of close_write and read(n);
Set: `require 'set'; a = Set[3,4,5]; a.to_a # => [3, 4, 5]; [5,6,7].to_set # => #<Set: {5, 6, 7}>; a.include?(5) # => true; a << 2 # => #<Set: {3, 4, 5, 2}>; a.delete(5) # => #<Set: {3, 4, 2}>`
Basic usage of the JSON library (the Psych YAML library works similarly);
Multithreading with `Thread.new {}`; for Mutex and Queue see the "runoob" tutorial.
`if ($0 == __FILE__)`, Ruby's equivalent of Python's `if __name__ == "__main__"` (run the block only when this file is the entry script), plus ARGV, the array of command-line arguments.
Simple regular expressions: `a = /(\d+)(abc)de/.match('12abcdef') # => #<MatchData "12abcde" 1:"12" 2:"abc">`; note that `a.to_a` returns the whole match first, then the capture groups: `["12abcde", "12", "abc"]`.
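The stdlib snippets above can be checked together in one short script (the cache hash is a made-up sample value):

```ruby
require 'set'
require 'json'

# Set: literal construction, insertion, deletion
a = Set[3, 4, 5]
a << 2
a.delete(5)
# a is now the set {3, 4, 2}

# Regexp: MatchData keeps the whole match at index 0,
# followed by each capture group
md = /(\d+)(abc)de/.match('12abcdef')
# md.to_a == ["12abcde", "12", "abc"]

# JSON: round-trip a hash the way the downloader caches its file list
# (keys come back as strings, which is why the code uses string keys)
cache = {'fn' => 'seg1.ts', 'link' => 'http://example.com/seg1.ts'}
restored = JSON.parse(cache.to_json)
# restored == cache
```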
```ruby
# m3u8 downloader "Hepburn"
require 'open-uri'
require 'fileutils'
require 'socket'
require 'json'
# mlink is the http URL of the m3u8 file

HEADER = {
  'user-agent' => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
  'connection' => 'keep-alive'
}

def parse_links(mfile, mlink)
  fio2 = File.open(mfile, 'r')
  lines = fio2.readlines.map{|x| x.chomp}
  fio2.close
  lines.reject!{|x| x == '' || x[0] == '#'}
  raise if lines.empty?

  infos = []

  if (lines[0][0] == '/')
    # pattern: host + each line's content
    pos1 = mlink.index('//') + 2
    host = mlink[0, mlink.index('/', pos1)]
    lines.each do |line|
      elems = line.split('/')
      fn = elems[-1]
      link = host + line
      infos << {'fn' => fn, 'link' => link}
    end
  else
    # pattern: base link + file name
    tmp = mlink.split('/')
    tmp.delete_at(-1)
    link_model = tmp.join('/')
    lines.each do |line|
      fn = line
      link = link_model + '/' + fn
      infos << {'fn' => fn, 'link' => link}
    end
  end

  return infos
end

$cache_file = 'cache.json'

def socket_worker(upload_msg, recv = false)
  begin
    cli = TCPSocket.new('127.0.0.1', 6262)
    cli.print upload_msg
    if recv
      cli.close_write
      ret = cli.read(30)
      return ret
    end
    cli.close
  rescue StandardError
    puts "cannot communicate with hb_server()"
    exit
  end
end

def read_cache
  if File.exist?($cache_file)
    puts "reading info from cache"
    fio2 = File.open($cache_file)
    infos = JSON.load(fio2.read)
    fio2.close
    return infos
  else
    raise StandardError, 'cannot read cached info'
  end
end

def new_task(mlink)
  tmp = URI.parse(mlink)
  unless (tmp.is_a?(URI::HTTP) || tmp.is_a?(URI::HTTPS))
    raise StandardError, "bad mlink format: '#{mlink}'"
  end
  begin
    io = URI.open(mlink, HEADER)
    fc = io.read
  rescue StandardError => ser
    puts "error downloading the m3u8 file: '#{ser.message}'"
    exit
  end
  File.open('index.m3u8', 'w') do |fio|
    fio.puts "##{mlink}"
    fio.puts fc
  end

  infos = parse_links('index.m3u8', mlink)
  File.open('cache.json', 'w') do |fio|
    fio.print infos.to_json
  end
  download_worker(infos)
end

def generate_task_name
  # assume the process pid is unique
  name = "#{Process.pid}"
  socket_worker("add:#{name}")
  return name
end

def download_worker(infos)
  check_dir

  # generate a task name
  task_name = generate_task_name
  puts "task name: '#{task_name}'"

  infos.reject!{|x| File.exist?(x['fn'])}
  puts "#{infos.size} files to download"

  err_fns = []

  infos.each do |info|
    # ask hb_server() whether this task should stop
    ret = socket_worker("status:#{task_name}", true)
    break if (ret != 'true')

    fn = info['fn']
    link = info['link']
    th = Thread.new do
      begin
        io = URI.open(link, HEADER)
        fc = io.read
        File.open(fn, 'wb') do |fio|
          fio.print fc
        end
        File.open('../filestats.txt', 'a') do |fio3|
          fio3.puts "#{fn}||#{fc.size}"
        end
      rescue StandardError => ser
        msg = "error downloading #{fn}: '#{ser.message}'"
        err_fns << msg
      end
    end
    time = 0
    while (th.alive? && time <= 30)
      sleep 1
      time += 1
    end
    if (time >= 30)
      msg = "timeout downloading #{fn}"
      err_fns << msg
    end

    th.kill
  end

  # make sure everything is saved

  Dir.chdir('..')
  unless err_fns.empty?
    puts "download errors: #{err_fns.size}"
    File.open('err.txt', 'w') do |fio|
      err_fns.each do |str|
        fio.puts str
      end
    end
  end

  # tell hb_server() on exit
  socket_worker("kill:#{task_name}")

  puts "download flow finished: #{tn = Time.now; tn.min.to_s + ':' + tn.sec.to_s}"
  sleep(1)
end

def check_dir
  Dir.mkdir('./ts') unless Dir.exist?('./ts')
  Dir.chdir('./ts')
end


def make_file
  infos = read_cache

  # read info about the downloaded files
  fstats = {}
  File.open('filestats.txt', 'r') do |fio|
    lines = fio.readlines.select{|x| x.include?('||')}
    lines.each do |ln|
      fn, size = ln.split('||')
      fstats[fn] = size.to_i # later records overwrite earlier ones
    end
  end

  # merge the files
  check_dir

  fio2 = File.open('bin_output.ts', 'wb')
  infos.each do |x|
    fn = x['fn']
    unless File.exist?(fn)
      raise "file '#{fn}' does not exist"
    end

    if (fstats[fn] == nil)
      puts "no record for file '#{fn}'"
    elsif (fstats[fn] != File.size(fn))
      raise "size of '#{fn}' does not match the recorded size"
    end

    File.open(fn, 'rb') do |fio|
      fio2.print fio.read
    end
  end
  fio2.close

  Dir.chdir('..')
  FileUtils.mv('./ts/bin_output.ts', './bin_output.ts')

  File.open('ffm.cmd', 'w') do |fio|
    fio.puts "ffmpeg -i bin_output.ts -c:v copy -c:a copy 00result.mp4"
  end

  puts 'ffmpeg command written'
end

if ($0 == __FILE__)
  case ARGV[0]
  when nil
    infos = read_cache
    download_worker(infos)
  when 'new'
    print 'enter m3u8 link >'
    str = STDIN.gets
    new_task(str.chomp)
  when 'make'
    make_file
  # the task server is a separate script (hb_server), shown below
  else
    raise 'unsupported command'
  end
end
```
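The per-file timeout in download_worker can be isolated as a reusable pattern: start the work in a thread, poll `alive?` against a deadline, then `kill`. A minimal sketch (the helper name, deadlines, and the `sleep` job are made up for illustration; the original polls in 1-second steps with a 30-second limit):

```ruby
# Run a block in a background thread with a wall-clock deadline.
# Returns true if the block finished in time, false if it was killed.
def run_with_deadline(seconds)
  th = Thread.new { yield }
  waited = 0
  while th.alive? && waited < seconds
    sleep 0.1
    waited += 0.1
  end
  finished = !th.alive?
  th.kill # no-op if the thread already finished
  finished
end

# a job that finishes well within the deadline
fast = run_with_deadline(3) { 1 + 1 }   # => true
# a job that overruns the deadline and gets killed
slow = run_with_deadline(0.5) { sleep 5 } # => false
```

Note that `Thread#kill` stops the thread abruptly, so (as in the original) any partially written file should be detected later, which is what the `filestats.txt` size check in make_file does.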
Hepburn's socket server, which manages the state of each task:
```ruby
require 'socket'
require 'set'

$tasks = Set[]

server1 = TCPServer.new('127.0.0.1', 6262)
# messages look like 'add:446', 'kill:4664', 'status:446'

Thread.new do
  loop do
    cli = server1.accept
    str = cli.read(30)
    mt = /(\w+):(\d+)/.match(str)
    if (mt != nil)
      action = mt[1]
      task_name = mt[2]
      case action
      when 'add'
        if $tasks.include?(task_name)
          puts "unexpected: duplicate pid task: #{task_name}"
        else
          $tasks << task_name
        end
      when 'kill'
        $tasks.delete(task_name)
      when 'status'
        if $tasks.include?(task_name)
          cli.print 'true'
        else
          cli.print 'false'
        end
      end
    end
    cli.close
  end
end

puts "enter a command:"

while true
  print '>>'
  uip = STDIN.gets
  uip.chomp!
  if (uip == 'exit')
    server1.close
    exit
  elsif (uip == 'list')
    puts "task list: [#{$tasks.to_a.join(',')}]"
  elsif (uip == 'irb')
    require 'irb'
    binding.irb
  else
    mt = /^x(\d+)/.match(uip) # 'x1463166' means: end the task named 1463166
    if mt
      if $tasks.include?(mt[1])
        $tasks.delete mt[1]
        puts 'marked as ended'
      else
        puts "no such task name: #{mt[1]}"
      end
    else
      puts "unknown command: '#{uip}'"
    end
  end
end
```
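The downloader/server handshake boils down to: the client writes a short command, calls `close_write` so the server's `read` hits EOF, then reads the reply. That can be exercised on a loopback socket with an ephemeral port (this is a self-contained sketch, not the original script; the task names are made up):

```ruby
require 'socket'
require 'set'

tasks = Set[]
server = TCPServer.new('127.0.0.1', 0) # port 0: let the OS pick a free port
port = server.addr[1]

Thread.new do
  loop do
    cli = server.accept
    msg = cli.read          # returns at EOF, because the client close_write'd
    action, name = msg.split(':')
    case action
    when 'add'    then tasks << name
    when 'kill'   then tasks.delete(name)
    when 'status' then cli.print(tasks.include?(name) ? 'true' : 'false')
    end
    cli.close
  end
end

def ask(port, msg)
  cli = TCPSocket.new('127.0.0.1', port)
  cli.print msg
  cli.close_write           # "done sending", but keep the read side open
  reply = cli.read
  cli.close
  reply
end

ask(port, 'add:1234')
s1 = ask(port, 'status:1234') # => "true"
ask(port, 'kill:1234')
s2 = ask(port, 'status:1234') # => "false"
```

The original server instead uses a fixed `read(30)` and a `/(\w+):(\d+)/` regex, which works because the client closes (or half-closes) the connection after each short message.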
Part Two: a crawler for a recruitment website
(This code was written fairly early, so it is a bit messy.)
It uses the following parts of Ruby:
Nokogiri: an XML parsing library;
SQLite3: working with sqlite3 databases:
.results_as_hash = true: return query results as hashes;
sql.query('begin'); sql.query('commit'): SQL transactions;
sqlite parameter binding: sql.query('update table set a = ? where b = ?', [a_value, b_value]);
ERB: Ruby's template engine, similar to PHP's: ERB.new(string).result(binding); see the official documentation for the template syntax.
String: the index(substring, startpos) form and encode(dst_enc, src_enc, options) (see the code);
Basic Ruby metaprogramming: Kernel/Object#send(:method, args); the usage of require versus load;
Getting the script's own full path: File.expand_path(File.dirname(__FILE__)) (modern Rubies also provide __dir__).
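The ERB and String calls in this list can be demonstrated without the crawler. The template string, sample values, and byte sequence below are made up for illustration (0xC4 0xE3 is the GBK encoding of 你, mirroring what utf8_cvt does to GBK pages):

```ruby
require 'erb'

# ERB.new(string).result(binding): <%= %> is evaluated against local variables
site_text = 'example site'
count = 3
template = 'Site: <%= site_text %>, <%= count %> postings'
html = ERB.new(template).result(binding)
# html == "Site: example site, 3 postings"

# String#index(substring, startpos): continue a search from an offset
s = 'a/b/c'
first = s.index('/')              # => 1
second = s.index('/', first + 1)  # => 3

# String#encode(dst_enc, src_enc, options):
# transcode GBK bytes to UTF-8, replacing anything unconvertible with '?'
gbk_bytes = "\xC4\xE3"
utf8 = gbk_bytes.encode('utf-8', 'gbk',
                        :invalid => :replace, :undef => :replace, :replace => '?')
```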
```ruby
require 'psych'
require 'open-uri'
require 'erb'
require 'sqlite3'
require 'nokogiri'
require 'set'


=begin
Usage:
rw = RecruitWalker.new()
rw.prepare_crawl
rw.add_task('job51')
rw.add_task('shundehr')
rw.begin_tasks
rw.write_html
=end


class TseekError < StandardError; end # tseek() cannot find the text

module TextTools

  def tseek(text_src, flag_begin, flag_end)
    # find the text between flag_begin and flag_end in the source text
    unless text_src.kind_of?(String)
      raise TseekError, "argument [0] is not a string"
    end
    bp = 0 # initial search position
    lt = text_src.index(flag_begin, bp)
    raise TseekError, "begin flag not found" if lt.nil?
    rt = text_src.index(flag_end, lt + flag_begin.size)
    raise TseekError, "end flag not found" if rt.nil?
    pos = lt + flag_begin.size
    len = rt - pos
    return text_src[pos, len] # the extracted text
  end
  module_function :tseek

  def utf8_cvt(text, enc='gbk')
    # convert gbk to utf-8
    return text.encode('utf-8', enc, :invalid => :replace, :undef => :replace, :replace => '?')
  end
  module_function :utf8_cvt

  def uri_open(link, header={})
    # download via open-uri; returns false on error
    begin
      io = URI.open(link, header)
      c = io.read
      return c
    rescue StandardError => ser
      return false
    end
  end
  module_function :uri_open

  def unixtime2date(unixtime)
    # format a unix timestamp as a date like '20201212'
    time1 = Time.at(unixtime.to_i)
    text = time1.year.to_s
    month = time1.month
    text << '0' if (month < 10)
    text << month.to_s
    day = time1.day
    text << '0' if (day < 10)
    text << day.to_s
    return text
  end
  module_function :unixtime2date
end


class RecruitWalker

  attr_reader :header, :datadb_fn, :legacydb_fn, :joblist, :late10dates

  def initialize
    @path = File.expand_path(File.dirname(__FILE__))
    # read configuration
    @datadb_fn = "#{@path}/database/store5.db"
    @legacydb_fn = "#{@path}/database/oldjob.db"

    # read the per-site configuration
    @conf_fn = "#{@path}/ext/info.yml"
    fc = File.read(@conf_fn)
    @joblist = Psych.load(fc)

    @late10dates = read_dates()
    return
  end

  def prepare_crawl
    # prepare the crawl: connect the sqlite databases
    @data_sql = SQLite3::Database.new(@datadb_fn)
    @data_sql.results_as_hash = true
    @legacy_sql = SQLite3::Database.new(@legacydb_fn)
    @task_list = {}
    # task list shape:
    # {
    #   task_name_1 => {:htmls => [], :infos => []},
    #   task_name_2 => {:htmls => [], :infos => []}
    # }

    # read the company-name blacklist
    fc2 = File.read("#{@path}/ext/blacklist.yml")
    @blacklist = Psych.load(fc2)
    @header = {'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36', 'cookie' => ''}
    return
  end

  def read_dates
    # the 10 most recent dates
    tn = Time.now
    time_array = []
    10.times do
      time_array << tn.clone
      tn -= 24*3600
    end
    dates_array = []
    time_array.each do |tm|
      day = tm.to_s.split(' ')[0].gsub('-', '')
      dates_array << day
    end
    return dates_array # an array of strings
  end

  def add_task(name)
    if @joblist.has_key?(name)
      @task_list[name] = {:htmls => [], :infos => []}
    else
      raise StandardError, "task name not found: '#{name}'"
    end
  end

  def begin_tasks
    # run every task in the task list
    # pages are fetched in parallel; parsing and SQL work run sequentially
    return if @task_list.empty?
    puts "task flow started, count: #{@task_list.keys.size}, time '#{Time.now}'"
    # download pages
    web_threads = []
    @task_list.each_pair do |k, v|
      th = Thread.new do
        v[:htmls] = scan_web_task(k, @joblist[k])
      end
      web_threads << th
    end
    # wait for all downloads to finish
    web_threads.map{|x| x.join}

    # html analysis
    @task_list.each_pair do |k, v|
      method = @joblist[k]['scanner'].to_sym
      puts "loading parser file: #{k}"
      load "#{@path}/ext/use-#{k}.rb"
      puts "parsing htmls: start: '#{@joblist[k]['text']}'"
      tmp_keys = Set.new
      v[:htmls].each do |html|
        tmp_list = send(method, html)
        tmp_list.each do |x|
          unless tmp_keys.include?(x['jobid'])
            v[:infos] << x
            tmp_keys << x['jobid']
          end
        end
      end
      puts "parsing htmls: done: '#{@joblist[k]['text']}'"
    end

    # data migration
    @task_list.each_pair do |k, v|
      data_migrate(@joblist[k]['table'])
    end

    # data update and writes
    @task_list.each_pair do |k, v|
      data_tidy(@joblist[k], v[:infos])
    end

    # save the crawl time
    update_conf
    puts "task flow finished."
    return
  end

  def scan_web_task(name, conf)
    puts "downloading: '#{conf['text']}'"

    htmls = []
    p_begin = conf['page_begin'].to_i
    p_end = conf['page_end'].to_i

    (p_begin .. p_end).each do |pid|
      link = conf['link'].sub('{{pageid}}', pid.to_s)
      html = TextTools::uri_open(link, @header)
      if (html == false)
        puts "http error downloading index page #{pid} of '#{name}'"
        next
      end
      if (conf['gbk'] == 'true')
        html = TextTools::utf8_cvt(html)
      end
      htmls << html
      sleep(rand(2))
    end
    puts "download done: '#{conf['text']}'"
    return htmls
  end

  def update_conf
    # write the crawl time back to the yaml file
    tn = Time.now.to_s
    @task_list.each_pair do |k, v|
      @joblist[k]['update_time'] = tn
    end
    File.open(@conf_fn, 'w') do |fio|
      fio.print @joblist.to_yaml
    end
    return
  end

  def data_migrate(table)
    # table is the table name

    drop_cmds = []   # remove outdated rows from the main database table
    update_cmds = [] # insert the outdated jobids into the oldjob database table
    cmd = "select jobid, post_date from #{table}"
    @data_sql.query(cmd).each do |x|
      jobid = x['jobid']; p_date = x['post_date']
      unless @late10dates.include?(p_date)
        drop_cmds << {:cmd => "delete from #{table} where jobid = ?", :params => [jobid]}
        update_cmds << {:cmd => "insert into #{table} (jobid, post_date) values (?, ?)", :params => [jobid, p_date]}
      end
    end

    @data_sql.query('begin')
    drop_cmds.each do |c|
      @data_sql.execute(c[:cmd], c[:params])
    end
    @data_sql.query('commit')

    @legacy_sql.query('begin')
    update_cmds.each do |c|
      @legacy_sql.execute(c[:cmd], c[:params])
    end
    @legacy_sql.query('commit')

    return
  end

  def data_tidy(conf, src_infos)
    table = conf['table']
    # collect the outdated jobids
    outdated_jobids = []
    @legacy_sql.query("select jobid from #{table}").each do |x|
      outdated_jobids << x[0]
    end

    black_count = 0 # number of postings blocked by the company-name blacklist

    infos = src_infos.select{ |x|
      bool1 = @late10dates.include?(x['post_date'])  # drop postings outside the date range
      bool2 = !@blacklist.include?(x['company'])     # drop postings from blacklisted companies
      black_count += 1 if (bool2 == false)
      bool3 = !outdated_jobids.include?(x['jobid'])  # drop postings whose jobid is already in the oldjob table
      (bool1 && bool2 && bool3)
    }

    puts "blocked #{black_count} postings via the company-name blacklist" if (black_count > 0)
    puts "data tidy: start: '#{conf['text']}'"
    tn1 = Time.now
    # collect the jobids currently in the main table

    lately_jobids = []

    @data_sql.query("select jobid from #{table}").each do |x|
      lately_jobids << x['jobid']
    end

    cmds = []
    update_count = 0
    insert_count = 0

    infos.each do |info|
      # handle each newly downloaded posting
      set = {}
      if lately_jobids.include?(info['jobid'])
        # a row with this jobid already exists within the date range
        set[:cmd] = "update #{table} set update_date = ? where jobid = ?"
        set[:params] = [info['post_date'], info['jobid']]
        update_count += 1
      else
        set[:cmd] = "insert into #{table} (jobid,post_date,company,link,name,srcsite,update_date) values (?, ?, ?, ?, ?, ?, ?)"
        set[:params] = [
          info['jobid'],
          info['post_date'],
          info['company'],
          info['link'],
          info['name'],
          info['srcsite'],
          info['update_date']
        ]
        insert_count += 1
      end
      cmds << set
    end

    puts "updates: #{update_count}", "inserts: #{insert_count}"
    puts 'writing new rows to the database..'
    @data_sql.query('begin')
    cmds.each do |c|
      @data_sql.query(c[:cmd], c[:params])
    end
    @data_sql.query('commit')

    return
  end

  def write_html
    # remove the html files saved by the previous run
    Dir.chdir("#{@path}/html")
    Dir.glob('*.html').each do |fn|
      File.unlink(fn)
    end
    Dir.chdir(@path)
    # write the index page
    fc1 = File.read("#{@path}/template2/index.erb")
    dates_text = @late10dates.map{|x| "'#{x}'"}.join(',')
    sites_text = @joblist.map{|k, v|
      "{'name':'#{k}', 'text':'#{v['text']}', 'upd_time':'#{v['update_time']}'}"
    }.join(',')

    html1 = ERB.new(fc1).result(binding)
    File.open("#{@path}/html/index.html", 'w') do |fio|
      fio.print html1
    end

    # write the per-site, per-date detail pages
    fc2 = File.read("#{@path}/template2/details.erb")

    @late10dates.each do |date|
      @task_list.each_pair do |k, v|
        conf = @joblist[k]
        site_text = conf['text']
        table = conf['table']

        infos = []
        cmd = "select * from #{table} where post_date = '#{date}'"
        @data_sql.query(cmd).each do |x|
          infos << x # grouping rows from the same company together is not attempted
        end
        count = infos.size # used inside the erb template
        html2 = ERB.new(fc2).result(binding)
        fn = "#{@path}/html/info-#{k}-#{date}.html"
        File.open(fn, 'w') do |fio|
          fio.print html2
        end
      end
    end

    puts "write flow: done."
    return
  end

end
```
Some Nokogiri usage is shown below. Note that Nokogiri::XML parses regular XML, while Nokogiri::HTML also supports HTML tags without a closing tag, such as <img> and <input>. On Windows it is best to pass the 'utf-8' argument to Nokogiri explicitly, otherwise it may parse using the terminal's GBK encoding.
Nokogiri supports XPath via xpath('./node'); to get the href of <a href="link">text</a>, use xpath('./a/@href').text.
Nokogiri also has .to_html, .to_s, .text and .value; I cannot find the code for these at the moment, so try them out in practice.
When using Nokogiri it is best to debug step by step: when an element is not found, it sometimes raises an exception and sometimes returns nil.
Enumerable#any?: `[3, 4, 5].any?{|x| x % 2 == 0} # => true`
```ruby
def index_html_analysis(html1, shared)
  # parse the html and collect most of each thread's info (except the first-post body)
  # each thread lives at <root><__T><item>
  results = []
  nk = Nokogiri::XML(html1, nil, 'utf-8')
  thrs = nk.xpath('/root/__T/item')
  thrs.each do |node|

    # skip forum-event threads
    board_obj = node.xpath('./parent/item')[1]
    if board_obj # non-nil when the <item> has child elements
      str = board_obj.text
      unless shared.boards.include?(str)
        next
      end
    end

    tid = node.xpath('./tid').text.to_i
    title = node.xpath('./subject').text

    next if shared.omit_titles.any?{|str| title.include?(str)}

    results << tid if (tid > 0) # filter bad ids; non-numeric text converts to 0
  end
  return results # currently only the thread tids are returned
end
```
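If Nokogiri is not installed, the same XPath pattern can be tried with the stdlib REXML library (the sample XML below is made up, and REXML's API differs from Nokogiri's, e.g. `elements.each` instead of `xpath` returning a NodeSet):

```ruby
require 'rexml/document'

xml = <<~XML
  <root><__T>
    <item><tid>101</tid><subject>hello</subject></item>
    <item><tid>abc</tid><subject>bad id</subject></item>
  </__T></root>
XML

doc = REXML::Document.new(xml)
tids = []
# walk every thread node at /root/__T/item, like the Nokogiri version
doc.elements.each('/root/__T/item') do |item|
  tid = item.elements['tid'].text.to_i # non-numeric text converts to 0
  tids << tid if tid > 0
end
# tids == [101]
```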
Source: https://www.cnblogs.com/uu6crella/p/13213048.html