Hadoop Conference Japan 2011

Transcript

  • 1. Hive Ameba -Hadoop Conference Japan 2011-

  • 2. Speaker introduction: works on Ameba Pigg and Patriot. Twitter: toutou / ID: id:ICHIRO

  • 3. Patriot / Hive / Patriot

  • 4.

  • 5. Ameba: 1,300 registered users (2011/1/5). PV (2010/12): 194.9 PV; 2011 target: 1,200 PV; PC: 99.9 PV; MB: 95.0 PV; UU: 367 UU. Pigg: 600 users (2011/1/14); ARPPU: 2,121 (2010/12)

  • 6. Ameba Pigg

  • 7. Pigg for Android

  • 8.

  • 9. Hadoop at Ameba

  • 10. Hadoop usage: HDFS, 0.13.1, pico, Amazon EMR, Pig

  • 11. Patriot

  • 12. Patriot history. 2009: 11/13 Hadoop Conference Japan 2009; CDH / Hive; 11/21-23; 11/27 GO. 2010: 3, 7, 111 WebUI

  • 13.

  • 14.

  • 15. Ameba

  • 16. HDFS, Map/Reduce, Hive, Patriot WebUI, CIC, HUE

  • 17. Hive

  • 18. Hive: a data warehouse on Hadoop, developed at Facebook; SQL-like queries are compiled into Map/Reduce jobs (cf. Pig)

  • 19. Hive: HiveQL (SQL-like) is translated into MapReduce; the metastore uses Derby by default (Patriot uses MySQL); supports Partitions

  • 20. Partition layout for a login table: date=2011-02-23 / dev=pc, dev=mb; date=2011-02-22; Buckets (a partition-pruning query sketch follows the transcript)

  • 21. Data types. Primitive: int, float, double, String. Complex: map<key-type, value-type>, list<element-type>, struct<field-name: field-type>

  • 22. DDL:
    CREATE TABLE pigg_login (time STRING, ameba_id STRING, ip STRING)
    PARTITIONED BY (dt STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS SEQUENCEFILE;
    Partition column: dt. Storage formats: TextFile, SequenceFile, RCFile

  • 23. Loading data:
    LOAD DATA (LOCAL) INPATH '/tmp/pigg_login.log' INTO TABLE pigg_login PARTITION(dt='2011-02-22');
    or dfs mv / put, or
    INSERT OVERWRITE TABLE t1 SELECT c1 FROM t2;

  • 24. HiveQL: JOIN / LEFT OUTER JOIN, GROUP BY, UNION ALL.
    SELECT * FROM t1 JOIN t2 ON (t1.a2 = t2.b2);

  • 25. HiveQL built-ins. UDF: cast, abs, substr. UDAF: count, sum, max, min, avg, percentile(col, p). http://wiki.apache.org/hadoop/Hive/LanguageManual/UDF

  • 26. SerDe (Serialization/Deserialization):
    CREATE TABLE test (c1 string, c2 int)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n';
    Input rows "ichiro[tab]16" and "suzuki[tab]51" map to c1 = ichiro, suzuki and c2 = 16, 51

  • 27. SerDe for Apache access logs (a sample query against this table follows the transcript):
    ADD JAR hive_contrib.jar;
    CREATE TABLE apachelog (host STRING, identity STRING, user STRING, time STRING, method STRING, resource STRING, proto STRING, status STRING, size STRING, referer STRING, agent STRING, proctime STRING)
    ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
    WITH SERDEPROPERTIES (
      "input.regex" = "^(.*?) (.*?) (.*?) \\[(.*?)\\] \"(\\S+?)(?: +(.*?) +(\\S*?))?\" (.*?) (.*?) \"(.*?)\" \"(.*?)\" (.*?)$",
      "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s");

  • 28. Patriot

  • 29. 3

  • 30. Util / Hadoop, Hive / Hive Job / DB (MySQL) / View

  • 31. NN, JT, SNN: multi-core CPU, 16GB RAM, Dell R300 x 1. DN, TT: multi-core CPU, 16GB RAM, 1TB HDD x 4 (RAID), Dell R410 x 18

  • 32. Util, DB, Web/AP: multi-core CPU, 16GB RAM, 1TB HDD x 4 (RAID1), Dell R410

  • 33. Software. Web: Ext JS 3.2.1, HUE 1.0.1. Hadoop: CDH3 (Hadoop 0.20, Hive 0.5). Operations: Puppet, Nagios, Ganglia, Hinemos 3.2

  • 34. misc: Hadoop NameNode metadata is also written to NFS

  • 35. Compression: gzip / bzip2 / LZO; TextFile / SequenceFile. Compression ratio: bzip2 > gzip > LZO. Speed: LZO > gzip > bzip2 (both compression and decompression). Patriot: gzip-compressed SequenceFile; Map output compressed with LZO (example settings follow the transcript)

  • 36. Util: collects logs over SCP, stores them on HDFS as gzip-compressed SequenceFiles, and loads them into Hive

  • 37. DSL (import):
    import {
      service "gyaos"
      backup_dir "/data/log/gyaos"
      data {
        type "scp"            # scp / mysql / hdfs
        servers ["172.xxx.yyy.zzz", "172.xxx.yyy.zzz"]
        user "cy_httpd"
        path "/home/cy_httpd/logs/tomcat/lifelog/*.#{$dt}*"
        limit 10000
      }
    }

  • 38. DSL (load):
    load {
      type "hive"             # hive / mysql
      table {
        name "game_login"
        regexp "^[^\t]*\t([^\t]*)\tlogin"
        output "$1"
        partition :dt => "#{$dt}", :service => "gyaos"
      }
      table {
        name "game_user"
        regexp "^([^\t]*)\t([^\t]*)\tregist_game"
        output "$2\t$1"
        partition :dt => "#{$dt}", :service => "gyaos"
      }
    }

  • 39. Util: runs Hive jobs and stores the results in the DB (MySQL)
  • 40. DSL (analyze):
    mysql {
      host "localhost"
      port 3306
      username "patriot-batch"
      password "xxx"
      database "gyaos"
    }
    analyze {
      name "gyaos_new_user_num_daily"
      primary "dt"
      hive_ql "select count(1), '#{$dt}' from game_user where dt='#{$dt}' and service='gyaos'"
    }
    analyze {
      name "gyaos_unregist_user_num_daily"
      primary "dt"
      hive_ql "select count(1), '#{$dt}' from game_user g join ameba_member a on (g.ameba_id = a.ameba_id) where a.unregist_date <> '' and to_date(a.unregist_date)='#{$dt}' and g.service='gyaos'"
    }

  • 41. Pigg: UNION ALL, game_login, Partition, ID

  • 42. Custom UDF (a registration and usage sketch follows the transcript):
    import org.apache.hadoop.hive.ql.exec.UDF;

    public class ConvertAge extends UDF {
      public Integer evaluate(String birth, String ym) {
        if (birth == null || ym == null) {
          return -1;
        }
        int ageRange = -1;
        // compute ageRange from birth and ym (calculation omitted on the slide)
        return ageRange;
      }
    }

  • 43. HiveQL: Pigg .GB / .GB

  • 44. Web CIC

  • 45.

  • 46.

  • 47.

  • 48. HUE

  • 49. Beeswax: a web UI for executing HiveQL

  • 50. HUE: Hive / HDFS; HUE: MySQL / Hive / HDFS

  • 51. Running HiveQL from the web UI:
    select count(distinct m.ameba_id)
    from pigg_enter m join pigg_enter p on m.ameba_id = p.ameba_id
    where m.dt like '2010-10-__' and p.dt like '2010-11-__'

  • 52. HBase (CDH3b4), Flume

  • 53. Ameba Technology Laboratory

  • 54.
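
The partition layout on slide 20 and the DDL on slide 22 imply that queries should filter on the dt partition column so Hive only scans the matching directories. A minimal sketch against the pigg_login table defined on slide 22; the aggregation itself is illustrative and not taken from the slides:

    -- Only the dt='2011-02-22' partition is read; other dates are pruned.
    SELECT ameba_id, count(1) AS login_cnt
    FROM pigg_login
    WHERE dt = '2011-02-22'
    GROUP BY ameba_id;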
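
Once the RegexSerDe table on slide 27 is defined, each Apache log field becomes an ordinary column and can be queried like any other Hive table. A small illustrative query, not from the slides:

    -- Count requests per HTTP status code over the apachelog table.
    SELECT status, count(1) AS cnt
    FROM apachelog
    GROUP BY status;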
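
Slide 35 states that Patriot stores gzip-compressed SequenceFiles and compresses intermediate Map output with LZO, but does not show the settings. A sketch using standard Hadoop 0.20 / Hive session properties of that era; the property names and the LzoCodec class (from the hadoop-lzo library) are assumptions, not taken from the slides:

    -- Compress intermediate Map output with LZO.
    SET mapred.compress.map.output=true;
    SET mapred.map.output.compression.codec=com.hadoop.compression.lzo.LzoCodec;
    -- Compress final job output, e.g. into gzip-compressed SequenceFiles.
    SET hive.exec.compress.output=true;
    SET mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
    SET mapred.output.compression.type=BLOCK;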
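
Slide 42 shows the ConvertAge UDF class but not how it is invoked from HiveQL. A minimal sketch; the jar name, the convert_age alias, and the pigg_user table with its birth column are hypothetical, introduced only for illustration:

    -- Register the UDF for this session and bucket users by age range.
    ADD JAR patriot-udf.jar;
    CREATE TEMPORARY FUNCTION convert_age AS 'ConvertAge';
    SELECT convert_age(birth, '2011-02') AS age_range, count(1) AS users
    FROM pigg_user
    WHERE dt = '2011-02-22'
    GROUP BY convert_age(birth, '2011-02');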