diff --git a/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java b/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java index a15baf91b322eb31f6fe59c20b424f7557843c1b..03c0173d35217002899397c7507e81e583b36ab6 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java +++ b/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java @@ -131,7 +131,7 @@ public class CodeRelationMapper { if (is_ClsName) { for (String ClsCode_fullName : ClsCode_fullName_set) { // 对每个java文件对应的full name,获取其short name - String ClsCode_shortName = getClsShortNameFromFullName(ClsCode_fullName); + String ClsCode_shortName = UMLClass.getClsShortNameFromFullName(ClsCode_fullName); // 如果Ent_doc.name与Cls_code的short name相似,再进行属性、方法的比较,否则直接pass if (EntName_SimilarWith_ClsShortName(Ent_doc.name.substring(6), ClsCode_shortName, 1.0)) { Ent_doc.possibleMapped_javaFiles.add(Pair.createPair(ClsCode_fullName, 1.0)); @@ -141,7 +141,7 @@ public class CodeRelationMapper { // 遍历一遍全体java文件,找所有与Ent_doc相似的Cls_code for (String ClsCode_fullName : ClsCode_fullName_set) { // 对每个java文件对应的full name,获取其short name - String ClsCode_shortName = getClsShortNameFromFullName(ClsCode_fullName); + String ClsCode_shortName = UMLClass.getClsShortNameFromFullName(ClsCode_fullName); String ClsCode_parentPackage = getParentPackageFromFullName(ClsCode_fullName); // 如果Ent_doc.name与Cls_code的short name相似,再进行属性、方法的比较,否则直接pass // 此处的阈值会影响准确率。建议在0.5左右 @@ -177,13 +177,6 @@ public class CodeRelationMapper { } } - /** - * 通过类全称,获得类短称 - */ - private static String getClsShortNameFromFullName(String clsCode_fullName) { - return clsCode_fullName.substring(clsCode_fullName.lastIndexOf(".") + 1); - } - /** * 通过类全称,获得类的父包 */ @@ -401,7 +394,7 @@ public class CodeRelationMapper { } else { try { CompilationUnit possibleMapped_ClsCode = StaticJavaParser.parse(new File(classFullName_javaFileDir_map.get(possibleMapped_javaFile_pair.getLeft()))); - String cls_shortName = getClsShortNameFromFullName(possibleMapped_javaFile_pair.getLeft()); + String cls_shortName = UMLClass.getClsShortNameFromFullName(possibleMapped_javaFile_pair.getLeft()); /* * 如果一个Cls_code与某个由related_Ent追踪到的Cls_code'有同类关系,则该Cls_code就是“根据关系推理追踪到的代码文件”。 * @@ -618,7 +611,7 @@ public class CodeRelationMapper { if (source_class_imports.isNonEmpty()) { for (ImportDeclaration import_declaration : source_class_imports) { String im_full_string = import_declaration.getNameAsString(); - String im_short_class_name = getClsShortNameFromFullName(im_full_string); + String im_short_class_name = UMLClass.getClsShortNameFromFullName(im_full_string); if (im_short_class_name.equals(class_short_name)) { // 直接将result替换为imports result = im_full_string; diff --git a/src/main/java/com/hy/java/uct/sdtocode/SDToCodeTracer.java b/src/main/java/com/hy/java/uct/sdtocode/SDToCodeTracer.java index 0d9b20bf4ddf38a1921f760fd51609457def763f..cdfdd34f61f04cb5214182a98d9ec9547f1b5cde 100644 --- a/src/main/java/com/hy/java/uct/sdtocode/SDToCodeTracer.java +++ b/src/main/java/com/hy/java/uct/sdtocode/SDToCodeTracer.java @@ -1,21 +1,20 @@ package com.hy.java.uct.sdtocode; -import java.util.ArrayList; import java.util.List; import java.util.Map; -import com.hy.java.uct.sdtocode.mapper.CodeMessageMapper; -import com.hy.java.uct.sdtocode.mapper.DocAnalyzer; +import com.hy.java.uct.sdtocode.mapper.CodeMessageTracer; import com.hy.java.uct.sdtocode.reader.CodeReader; import com.hy.java.uct.sdtocode.reader.DocReader; import com.hy.java.uct.sdtocode.reader.SDReader; +import 
com.hy.java.uct.util.sd.Message; import com.hy.java.uct.util.sd.UMLObject; +import com.hy.java.utility.common.Pair; /** * 顺序图的追踪分为两类:1、以对象为主。2、以消息为主。最后设计一种方法综合两类结果。 * * 消息必须按顺序记录,且需将return配对。return中的数据类型可以为正向消息的追踪提供参考。 - * */ public class SDToCodeTracer { /** @@ -44,96 +43,55 @@ public class SDToCodeTracer { public static void main(String[] args) { /* * 1、读取模型信息 + * + * 读取完UML图识别结果后,将实体信息保存在objs_in_SD里。形式为Pair, List> + * + * 消息必须按顺序记录,且需将return配对。return中的数据类型可以为正向消息的追踪提供参考。 */ - // 读取完UML图识别结果后,将实体信息保存在classes_in_CD里。形式为 - /* - * Hadoop HDFS1 - */ - Map objs_in_SD = SDReader.read(sd_dir + "sd-Hadoop HDFS1.txt"); - /* - * Hadoop MapReduce - */ - // Map classes_in_CD = CDReader.read(cd_dir + "cd-Hadoop MapReduce.txt"); + // Hadoop HDFS + // Pair, List> objs_in_SD = SDReader.read(sd_dir + "sd-Hadoop HDFS1.txt"); + Pair, List> objs_in_SD = SDReader.read(sd_dir + "sd-Hadoop HDFS2.txt"); + // Hadoop MapReduce + // Pair, List> objs_in_SD = SDReader.read(sd_dir + "sd-Hadoop MapReduce.txt"); // 检查结果,可注释掉 - SDReader.check(objs_in_SD); + // SDReader.check(objs_in_SD); /* - * 2、读取文档信息 + * 2、读取类图追踪结果 * - * 做“自己方法内有无文档的对比”的实验时,在不导入任何文档即可 - */ - List doc_dir_ls = new ArrayList<>(); - // 在这儿添加多个文件 - /* - * Hadoop HDFS - */ - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Hadoop architectural overview.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Hadoop clusters with Kove® XPD™ persistent memory.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Hadoop Distributed File System (HDFS) Architecture – A Guide to HDFS for Every Data Engineer.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\HADOOP ECOSYSTEM.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Hadoop HDFS Architecture Explanation and Assumptions.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\HDFS Architecture Guide.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\HDFS Architecture.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\HDFS.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Key Design of HDFS Architecture.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\The Hadoop Distributed File System Architecture and Design.txt"); - doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Towards A Scalable HDFS Architecture.txt"); - /* - * Hadoop MapReduce + * 将对象分解 */ - /* - * doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt"); doc_dir_ls.add(doc_dir + - * "Hadoop MapReduce\\Apache Hadoop Architecture – HDFS, YARN & MapReduce.txt"); doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Big Data Analysis Challenges and Solutions.txt"); doc_dir_ls.add(doc_dir + - * "Hadoop MapReduce\\Big Data Management on Wireless Sensor Networks.txt"); doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Hadoop - MapReduce.txt"); doc_dir_ls.add(doc_dir + - * "Hadoop MapReduce\\Hadoop Architecture in Detail – HDFS, Yarn & MapReduce.txt"); doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Hadoop MapReduce- Java-based Processing Framework for Big Data.txt"); - * doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce – Components.txt"); doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce Architecture1.txt"); doc_dir_ls.add(doc_dir + - * "Hadoop MapReduce\\MapReduce Architecture2.txt"); doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce Architecture3.txt"); doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce Tutorial.txt"); - * doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce Working and Components.txt"); 
doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce.txt"); doc_dir_ls.add(doc_dir + - * "Hadoop MapReduce\\mapreduce_hadoop2.txt"); doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Understanding MapReduce in Hadoop.txt"); doc_dir_ls.add(doc_dir + - * "Hadoop MapReduce\\What are the components of MapReduce.txt"); doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\What Is MapReduce Architecture An Important Overview For 2021.txt"); - */ - // 实际使用的Map,保存每份文档地址及其内容 - Map> dir_sentences_map = DocReader.readDocs(doc_dir_ls); + // Hadoop HDFS + DocReader.readDoc(doc_dir + "Hadoop HDFS\\Hadoop HDFS.xls", objs_in_SD); + // Hadoop MapReduce + // DocReader.readDoc(doc_dir + "Hadoop MapReduce\\Hadoop MapReduce.xls", objs_in_SD); + // DocReader.check(objs_in_SD); /* * 3、读取code path指定的目录下所有java文件 * * <类全称(包+类名), java_file_path> */ - // 记得改这里的路径 - // Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path-hdfs"); - Map classFullName_javaFileDir_map = null; + Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path-hdfs"); // Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path-mr"); // 检查结果,可注释掉 // CodeReader.check(classFullName_javaFileDir_map); /* - * 4、分析文档信息。实际相当于增加类图中的UMLclass、类本身的内容、类之间关系 - * - * 核心思想:我们的目标是以模型为准,将模型元素追踪到代码。由于模型元素抽象层次较高,所以直接追踪未必追踪的到,因此需找与模型元素具有关系的其他元素(向下一层层找),将模型元素降维、直至能与代码元素对应上为止。 + * 4、开始追踪 * - * 以下追踪实际包含3层:图中的类、文档中的实体、代码中的类。 + * 对每条消息,先辨别其消息内容是否包含多个标识符。如果只有单个标识符、且该标识符是一个方法,则依次用前两种即可;如果有多个标识符、或标识符是一个类,则除了使用前两种之外,还需使用第三种追踪。 * - * 先将图中的类追踪到包含文档信息的实体(一对多),再将这些实体追踪到代码中的类(也是一对多),所以是“一→多→更多”。 + * 第一种:1)支撑类中遍历属性,找那种名称或类型与消息另一端语义相似的属性,记录其类型。2)①在支撑类中找该属性的相关操作,对比每个操作与消息的语义相似度。比如new操作可以对应消息内容run。②对属性类型对应的类,找其中与消息内容语义相似的方法。 * - * 其中,实体间还会根据语义分析得到一些关系 - */ - Map classShortName_classObj_mappedByDoc = DocAnalyzer.analyze(objs_in_SD, dir_sentences_map); - // 检查结果,可注释掉 - // DocAnalyzer.check(classShortName_classObj_mappedByDoc); - /* - * 5、遍历模型中的实体元素(类或对象),针对每个元素,在code中寻找能匹配的java文件 + * 第二种:1)支撑类中阅读代码、记录所有方法(包括方法逻辑中用到的其他方法)。2)计算每个方法与消息内容的语义相似度。找到相似的方法。3)①找该方法内是否有对消息另一端的调用,如参数、方法内的变量、语句等。②对与消息相似的方法,找到其所属的类,在这个类里看该方法的返回类型是否与另一端相似。 * - * 基于启发式模糊匹配的UML类图与代码追踪方法:首先针对类图中的类和代码中的类,基于类的名称进行字符串完全匹配,从而建立确定的初始追踪;基于同义词、 词缀词典等语料库,定义基于类名称匹配和关联关系的启发式匹配规则,研究基于模糊匹配技术的追踪关系建立方法,基于初始追踪和启发式规则, 对类名不一致的模型和代码元素进行启发式追踪,扩展初始追踪关系。 - */ - Map mapped_classes = CodeMessageMapper.map(classShortName_classObj_mappedByDoc, classFullName_javaFileDir_map, false); - /* - * Hadoop HDFS - */ - CodeMessageMapper.save(mapped_classes, res_dir + "Hadoop HDFS-noadd.xls"); - /* - * Hadoop MapReduce + * 第三种※:1)解析消息中的内容。将每个标识符对应为一个类或者一个方法。比如第一个标识符必须是类;后面的标识符如果是小写开头,那认为它是某个类型的变量名,所以在前面的标识符所属的类中找具有该名称的变量其所属的类。最终在最后一个标识符那儿找方法名。※注意:存在一种情况,即“前面的标识符所属的类中找不到后面的标识符”。所以应该对每个标识符与其下一个标识符都进行单独的查找,然后记录所有“ + * 能从上一个标识符找到下一个标识符”的链条片段。将所有片段连接起来,断开的部分就直接放上对应的原消息内容即可。2)将由标识符链接起来的链条作为消息的追踪。记录链条两端。3)对比链条两端与消息两端的对象。如果存在支撑类之类的关系,则认为消息两端应该是支撑类;否则仍保留链条两端作为消息两端。 */ - // CodeRelationMapper.save(mapped_classes, res_dir + "Hadoop MapReduce-noadd.xls"); + List traced_msgs = CodeMessageTracer.trace(objs_in_SD, classFullName_javaFileDir_map); + // Hadoop HDFS + CodeMessageTracer.save(traced_msgs, res_dir + "Hadoop HDFS.xls"); + // Hadoop MapReduce + // CodeRelationMapper.save(traced_msgs, res_dir + "Hadoop MapReduce.xls"); // 检查结果,可注释掉 - // CodeRelationMapper.check(res_dir + "Apache OODT File Manager.xls"); + // CodeRelationMapper.check(res_dir + "Hadoop HDFS.xls"); } } diff --git a/src/main/java/com/hy/java/uct/sdtocode/mapper/CodeMessageMapper.java b/src/main/java/com/hy/java/uct/sdtocode/mapper/CodeMessageMapper.java deleted 
file mode 100644 index fdf9d601669eb5d1eb1363bcfc24050ef8e4ec3c..0000000000000000000000000000000000000000 --- a/src/main/java/com/hy/java/uct/sdtocode/mapper/CodeMessageMapper.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.hy.java.uct.sdtocode.mapper; - -import java.util.Map; - -import com.hy.java.uct.util.sd.UMLObject; - -public class CodeMessageMapper { - - public static Map map(Map classShortName_classObj_mappedByDoc, Map classFullName_javaFileDir_map, boolean b) { - // TODO Auto-generated method stub - return null; - } - - public static void save(Map mapped_classes, String string) { - // TODO Auto-generated method stub - - } - -} diff --git a/src/main/java/com/hy/java/uct/sdtocode/mapper/CodeMessageTracer.java b/src/main/java/com/hy/java/uct/sdtocode/mapper/CodeMessageTracer.java new file mode 100644 index 0000000000000000000000000000000000000000..c0e6bdae7cc5ccf283f25df9ba04a950e9646183 --- /dev/null +++ b/src/main/java/com/hy/java/uct/sdtocode/mapper/CodeMessageTracer.java @@ -0,0 +1,151 @@ +package com.hy.java.uct.sdtocode.mapper; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import com.hy.java.uct.util.sd.Message; +import com.hy.java.uct.util.sd.UMLObject; +import com.hy.java.utility.common.Pair; + +/** + * 对每条消息,先辨别其消息内容是否包含多个标识符。如果只有单个标识符、且该标识符是一个方法,则依次用前两种即可;如果有多个标识符、或标识符是一个类,则除了使用前两种之外,还需使用第三种追踪。 + * + * 第一种:1)支撑类中遍历属性,找那种名称或类型与消息另一端语义相似的属性,记录其类型。2)①在支撑类中找该属性的相关操作,对比每个操作与消息的语义相似度。比如new操作可以对应消息内容run。②对属性类型对应的类,找其中与消息内容语义相似的方法。 + * + * 第二种:1)支撑类中阅读代码、记录所有方法(包括方法逻辑中用到的其他方法)。2)计算每个方法与消息内容的语义相似度。找到相似的方法。3)①找该方法内是否有对消息另一端的调用,如参数、方法内的变量、语句等。②对与消息相似的方法,找到其所属的类,在这个类里看该方法的返回类型是否与另一端相似。 + * + * 第三种※:1)解析消息中的内容。将每个标识符对应为一个类或者一个方法。比如第一个标识符必须是类;后面的标识符如果是小写开头,那认为它是某个类型的变量名,所以在前面的标识符所属的类中找具有该名称的变量其所属的类。最终在最后一个标识符那儿找方法名。※注意:存在一种情况,即“前面的标识符所属的类中找不到后面的标识符”。所以应该对每个标识符与其下一个标识符都进行单独的查找,然后记录所有“ + * 能从上一个标识符找到下一个标识符”的链条片段。将所有片段连接起来,断开的部分就直接放上对应的原消息内容即可。2)将由标识符链接起来的链条作为消息的追踪。记录链条两端。3)对比链条两端与消息两端的对象。如果存在支撑类之类的关系,则认为消息两端应该是支撑类;否则仍保留链条两端作为消息两端。 + */ +public class CodeMessageTracer { + /** + * 对每条消息,共有三种追踪方法:前两种分别针对消息对象;第三种针对特殊的消息。 + */ + public static List trace(Pair, List> objs_in_SD, Map classFullName_javaFileDir_map) { + List res = new ArrayList<>(); + // 逐条追踪。不用追踪返回消息,只用它作为对应正向消息的参考。追踪完正向消息后返回消息的“返回起点”自然就是正向消息的最后一个带返回值的函数。 + List msgs_inSD = objs_in_SD.getRight(); + for (Message msg_inSD : msgs_inSD) { + // 只追踪正向消息,不追踪返回消息 + if (!msg_inSD.is_return) { + /* + * 0、预处理:对每条消息,先辨别其消息内容是否包含多个标识符。 + * + * 如果只有单个标识符、且该标识符是一个方法,则依次用前两种即可;如果有多个标识符、或标识符是一个类,则除了使用前两种之外,还需使用第三种追踪。 + */ + boolean msg_is_complex = check_msg_complex(msg_inSD); + /* + * 1、第一种追踪:针对对象和属性 + * + * 1)支撑类中遍历属性,找那种名称或类型与消息另一端语义相似的属性,记录其类型。 + * + * 2)①在支撑类中找该属性的相关操作,对比每个操作与消息的语义相似度。比如new操作可以对应消息内容run。②对属性类型对应的类,找其中与消息内容语义相似的方法。 + */ + traceByAttri(res, objs_in_SD, classFullName_javaFileDir_map); + /* + * 2、第二种追踪:针对对象和方法 + * + * 1)支撑类中阅读代码、记录所有方法(包括方法逻辑中用到的其他方法)。 + * + * 2)计算每个方法与消息内容的语义相似度。找到相似的方法。 + * + * 3)①找该方法内是否有对消息另一端的调用,如参数、方法内的变量、语句等。②对与消息相似的方法,找到其所属的类,在这个类里看该方法的返回类型是否与另一端相似。 + */ + traceByMethod(res, objs_in_SD, classFullName_javaFileDir_map); + /* + * 3、第三种追踪:针对消息 + * + * 1)解析消息中的内容。将每个标识符对应为一个类或者一个方法。比如第一个标识符必须是类;后面的标识符如果是小写开头,那认为它是某个类型的变量名,所以在前面的标识符所属的类中找具有该名称的变量其所属的类。最终在最后一个标识符那儿找方法名。※注意:存在一种情况,即“前面的标识符所属的类中找不到后面的标识符”。所以应该对每个标识符与其下一个标识符都进行单独的查找,然后记录所有“ + * 能从上一个标识符找到下一个标识符”的链条片段。将所有片段连接起来,断开的部分就直接放上对应的原消息内容即可。 + * + * 2)将由标识符链接起来的链条作为消息的追踪。记录链条两端。 + * + * 3)对比链条两端与消息两端的对象。如果存在支撑类之类的关系,则认为消息两端应该是支撑类;否则仍保留链条两端作为消息两端。 + */ + if (msg_is_complex) { + traceByMsg(res, objs_in_SD, 
classFullName_javaFileDir_map); + } + } + } + return res; + } + + /** + * 预处理:对每条消息,先辨别其消息内容是否包含多个标识符。 + * + * 如果只有单个标识符、且该标识符是一个方法,则依次用前两种即可;如果有多个标识符、或标识符是一个类,则除了使用前两种之外,还需使用第三种追踪。 + */ + private static boolean check_msg_complex(Message msg_inSD) { + boolean res = false; + if (msg_inSD.msg.split("\\.").length > 1) { + res = true; + } else { + // 如果消息内容只有一个标识符,且不包含括号,且第一个字符是大写字母,则认为它是个类名。这种也算需要“针对消息追踪”的复杂情形 + if (!msg_inSD.msg.contains("()")) { + char c = msg_inSD.msg.charAt(0); + if (c >= 'A' && c <= 'Z') { + res = true; + } + } + } + return res; + } + + /** + * 第一种追踪:针对对象和属性 + * + * 1)支撑类中遍历属性,找那种名称或类型与消息另一端语义相似的属性,记录其类型。 + * + * 2)①在支撑类中找该属性的相关操作,对比每个操作与消息的语义相似度。比如new操作可以对应消息内容run。②对属性类型对应的类,找其中与消息内容语义相似的方法。 + */ + private static void traceByAttri(List res, Pair, List> objs_in_SD, Map classFullName_javaFileDir_map) { + /* + * 来来来来来来来来来来~ + * + * 来来来来来来来来来来~ + * + * 来来来来来来来来~ + * + * 小时候悄悄在路上~ + */ + } + + /** + * 第二种追踪:针对对象和方法 + * + * 1)支撑类中阅读代码、记录所有方法(包括方法逻辑中用到的其他方法)。 + * + * 2)计算每个方法与消息内容的语义相似度。找到相似的方法。 + * + * 3)①找该方法内是否有对消息另一端的调用,如参数、方法内的变量、语句等。②对与消息相似的方法,找到其所属的类,在这个类里看该方法的返回类型是否与另一端相似。 + */ + private static void traceByMethod(List res, Pair, List> objs_in_SD, Map classFullName_javaFileDir_map) { + // TODO Auto-generated method stub + + } + + /** + * 第三种追踪:针对消息 + * + * 1)解析消息中的内容。将每个标识符对应为一个类或者一个方法。比如第一个标识符必须是类;后面的标识符如果是小写开头,那认为它是某个类型的变量名,所以在前面的标识符所属的类中找具有该名称的变量其所属的类。最终在最后一个标识符那儿找方法名。※注意:存在一种情况,即“前面的标识符所属的类中找不到后面的标识符”。所以应该对每个标识符与其下一个标识符都进行单独的查找,然后记录所有“ + * 能从上一个标识符找到下一个标识符”的链条片段。将所有片段连接起来,断开的部分就直接放上对应的原消息内容即可。 + * + * 2)将由标识符链接起来的链条作为消息的追踪。记录链条两端。 + * + * 3)对比链条两端与消息两端的对象。如果存在支撑类之类的关系,则认为消息两端应该是支撑类;否则仍保留链条两端作为消息两端。 + * + * @param res + */ + private static void traceByMsg(List res, Pair, List> objs_in_SD, Map classFullName_javaFileDir_map) { + // TODO Auto-generated method stub + } + + /** + * 保存追踪结果 + */ + public static void save(List trace_msgs, String string) { + // TODO Auto-generated method stub + + } +} diff --git a/src/main/java/com/hy/java/uct/sdtocode/mapper/DocAnalyzer.java b/src/main/java/com/hy/java/uct/sdtocode/mapper/DocAnalyzer.java index 37b8b0fbef0976fc9f1fb15eb97cac2bb92bafdb..96259ae47100b4048ea703b0f9813cb9c067bd1c 100644 --- a/src/main/java/com/hy/java/uct/sdtocode/mapper/DocAnalyzer.java +++ b/src/main/java/com/hy/java/uct/sdtocode/mapper/DocAnalyzer.java @@ -3,11 +3,13 @@ package com.hy.java.uct.sdtocode.mapper; import java.util.List; import java.util.Map; +import com.hy.java.uct.util.sd.Message; import com.hy.java.uct.util.sd.UMLObject; +import com.hy.java.utility.common.Pair; public class DocAnalyzer { - public static Map analyze(Map objs_in_SD, Map> dir_sentences_map) { + public static Map analyze(Pair, List> objs_in_SD) { // TODO Auto-generated method stub return null; } diff --git a/src/main/java/com/hy/java/uct/sdtocode/reader/DocReader.java b/src/main/java/com/hy/java/uct/sdtocode/reader/DocReader.java index 0664369faf89bcba076a19ce680667ededccc72a..e534a6d038111a5bff5dc6c0ecfc41ac42de72ad 100644 --- a/src/main/java/com/hy/java/uct/sdtocode/reader/DocReader.java +++ b/src/main/java/com/hy/java/uct/sdtocode/reader/DocReader.java @@ -1,21 +1,74 @@ package com.hy.java.uct.sdtocode.reader; -import java.util.HashMap; +import java.io.File; +import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Set; -import com.hy.java.utility.common.FileEditor; +import com.hy.java.uct.util.sd.Message; +import com.hy.java.uct.util.sd.UMLObject; +import com.hy.java.utility.common.Pair; + +import jxl.Sheet; +import jxl.Workbook; +import 
jxl.read.biff.BiffException; public class DocReader { /** - * 读取文档信息 + * 读取类图追踪结果,将顺序图中的对象分解下去 */ - public static Map> readDocs(List doc_ls) { - Map> res = new HashMap<>(); - for (String doc_dir : doc_ls) { - FileEditor doc_file = new FileEditor(doc_dir); - res.put(doc_dir, doc_file.readLines()); + public static void readDoc(String doc_dir, Pair, List> objs_in_SD) { + // 读取类图追踪结果,将顺序图中的对象分解下去 + try { + // 工作簿 + Workbook book = Workbook.getWorkbook(new File(doc_dir)); + // 获得第一个工作表对象 + Sheet sheet = book.getSheet("Sheet1"); + int rows = sheet.getRows(); + for (int row = 1; row < rows; row++) { + // 读取类图追踪结果 + String _class = sheet.getCell(0, row).getContents(); + String code = sheet.getCell(1, row).getContents(); + // String ratio = sheet.getCell(2, row).getContents(); + if (code != null) { + // 将顺序图中的对象分解下去 + if (objs_in_SD.getLeft().containsKey(_class)) { + UMLObject obj = objs_in_SD.getLeft().get(_class); + // 分解下去 + obj.mapped_file_dir_ls.add(code); + } + } + } + book.close(); + } catch (BiffException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + // 更新Message两端的对象:将其也分解下去 + List msgs = objs_in_SD.getRight(); + for (Message msg : msgs) { + msg.source = objs_in_SD.getLeft().get(msg.source.getTitle()); + msg.target = objs_in_SD.getLeft().get(msg.target.getTitle()); + } + } + + public static void check(Pair, List> objs_in_SD) { + Map objs = objs_in_SD.getLeft(); + Set obj_names = objs.keySet(); + for (String obj_name : obj_names) { + UMLObject obj = objs.get(obj_name); + System.out.println(obj.getTitle() + "共映射到" + obj.mapped_file_dir_ls.size() + "个java文件"); + for (String dir : obj.mapped_file_dir_ls) { + System.out.println(obj.getTitle() + "映射到" + dir); + } + } + List msgs = objs_in_SD.getRight(); + int size = msgs.size(); + for (int i = 0; i < size; i++) { + Message msg = msgs.get(i); + System.out.println("第" + (i + 1) + "条消息的源" + msg.source.getTitle() + "映射到" + msg.source.mapped_file_dir_ls.size() + "个java文件,目标" + msg.target.getTitle() + "映射到" + msg.target.mapped_file_dir_ls.size() + "个java文件"); } - return res; } } diff --git a/src/main/java/com/hy/java/uct/sdtocode/reader/SDReader.java b/src/main/java/com/hy/java/uct/sdtocode/reader/SDReader.java index 1f94a181b0c8a999ef0ba7bafdaf4645a7dcb35d..fad86c1bc8d404d62d53f8aaee1702171ae77fef 100644 --- a/src/main/java/com/hy/java/uct/sdtocode/reader/SDReader.java +++ b/src/main/java/com/hy/java/uct/sdtocode/reader/SDReader.java @@ -1,52 +1,75 @@ package com.hy.java.uct.sdtocode.reader; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; -import java.util.Set; import com.hy.java.uct.util.sd.Message; import com.hy.java.uct.util.sd.UMLObject; import com.hy.java.utility.common.FileEditor; +import com.hy.java.utility.common.Pair; public class SDReader { /** * 读取UML图识别结果,将实体信息保存在result里并返回 */ - public static Map read(String sd_recog_res_path) { - Map result = new HashMap<>(); + public static Pair, List> read(String sd_recog_res_path) { + Pair, List> result = Pair.createPair(new HashMap(), new ArrayList<>()); // UML图识别结果 FileEditor model_file = new FileEditor(sd_recog_res_path); - // 保存对象名。对象之间以“#”隔开 - String[] obj_strs = model_file.readFileToString().split("#"); - for (String obj_str : obj_strs) { - UMLObject UML_obj = new UMLObject(); - // 消息用“@”隔开 - String[] obj_info_strs = obj_str.split("@"); - // 对象名 - UML_obj.setTitle(obj_info_strs[0].trim()); - result.put(UML_obj.getTitle(), UML_obj); + // 分隔消息。消息之间以“¥”隔开 + String[] msgs_str = 
model_file.readFileToString().split("¥"); + for (String msg_str : msgs_str) { + Message msg = new Message(); + String[] msg_info = msg_str.split("%"); + if (msg_info.length == 4) { + String msg_text = msg_info[0].trim(); + String msg_source_name = msg_info[1].trim(); + String msg_target_name = msg_info[2].trim(); + String msg_is_return = msg_info[3].trim(); + // 保存消息信息 + msg.msg = msg_text; + // 保存消息源 + if (!result.getLeft().containsKey(msg_source_name)) { + UMLObject obj = new UMLObject(); + obj.setTitle(msg_source_name); + result.getLeft().put(msg_source_name, obj); + } + msg.source = result.getLeft().get(msg_source_name); + // 保存消息目标 + if (!result.getLeft().containsKey(msg_target_name)) { + UMLObject obj = new UMLObject(); + obj.setTitle(msg_target_name); + result.getLeft().put(msg_target_name, obj); + } + msg.target = result.getLeft().get(msg_target_name); + // 保存消息是否为返回 + if (msg_is_return.equals("true")) { + msg.is_return = true; + } + } + result.getRight().add(msg); } - // 根据对象名保存关系(出) - for (String obj_str : obj_strs) { - String[] obj_info_strs = obj_str.split("@"); - UMLObject UML_obj = result.get(obj_info_strs[0].trim()); - // 出 - if (obj_info_strs.length >= 2) { - if (!obj_info_strs[1].isBlank()) { - // 关系用“¥”隔开 - String[] out_msgs = obj_info_strs[1].split("¥"); - for (String out_msg_str : out_msgs) { - Message msg = new Message(); - String[] msg_info = out_msg_str.split("%"); - if (msg_info.length >= 2) { - msg.source = result.get(msg_info[0].trim()); - msg.target = result.get(msg_info[1].trim()); - if (msg_info.length == 3) { - msg.msg = msg_info[2]; - } else { - msg.msg = ""; + // 将返回消息与消息配对。正向消息配一条返回消息;返回消息配一条正向消息 + List msgs = result.getRight(); + int size = msgs.size(); + // 算法:从前往后找返回消息。找到一条返回消息后,从它开始往前找第一条“目标与源正好相反”的正向消息,两者即为配对 + for (int i = 0; i < size; i++) { + if (msgs.get(i).is_return) { + // 找到一条返回消息 + Message return_msg = msgs.get(i); + // 从它开始往前找正向消息 + for (int j = i - 1; j >= 0; j--) { + if (!msgs.get(j).is_return) { + Message forward_msg = msgs.get(j); + if (forward_msg.return_msg == null) { + // 判断两条消息的目标与源是否正好相反 + if (return_msg.source.getTitle().equals(forward_msg.target.getTitle()) && return_msg.target.getTitle().equals(forward_msg.source.getTitle())) { + forward_msg.return_msg = return_msg; + return_msg.forward_msg = forward_msg; + break; } - UML_obj.out_msgs.add(msg); } } } @@ -55,15 +78,23 @@ public class SDReader { return result; } - public static void check(Map objs_in_SD) { - Set keys = objs_in_SD.keySet(); - for (String key : keys) { - UMLObject uo = objs_in_SD.get(key); - System.out.println("对象名:" + uo.getTitle()); - for (Message m : uo.out_msgs) { - System.out.println("出关:" + m.source.getTitle() + "→" + m.target.getTitle() + "说" + m.msg); + public static void check(Pair, List> objs_in_SD) { + List msgs = objs_in_SD.getRight(); + int size = msgs.size(); + for (int i = 0; i < size; i++) { + Message msg = msgs.get(i); + if (!msg.is_return) { + System.out.println("第" + (i + 1) + "条消息是正向消息,从" + msg.source.getTitle() + "→" + msg.target.getTitle() + ",消息内容是" + msg.msg); + if (msg.return_msg != null) { + System.out.println(" 第" + (i + 1) + "条消息的返回消息是" + msg.return_msg.target.getTitle() + "←" + msg.return_msg.source.getTitle() + ",说了" + msg.return_msg.msg); + } + } else { + System.out.println("第" + (i + 1) + "条消息是返回消息,这样" + msg.target.getTitle() + "←" + msg.source.getTitle() + "返回的,消息内容是" + msg.msg); + if (msg.forward_msg != null) { + System.out.println(" 第" + (i + 1) + "条消息的正向消息是" + msg.forward_msg.source.getTitle() + "→" + 
msg.forward_msg.target.getTitle() + ",说了" + msg.forward_msg.msg); + } } - System.out.println("======================="); + System.out.println("==================================="); } } } diff --git a/src/main/java/com/hy/java/uct/sdtocode/util/TracedMessagePath.java b/src/main/java/com/hy/java/uct/sdtocode/util/TracedMessagePath.java new file mode 100644 index 0000000000000000000000000000000000000000..aead31edb5d3daae3ab50b7628138913ebcebfaf --- /dev/null +++ b/src/main/java/com/hy/java/uct/sdtocode/util/TracedMessagePath.java @@ -0,0 +1,18 @@ +package com.hy.java.uct.sdtocode.util; + +import java.util.ArrayList; +import java.util.List; + +import com.hy.java.utility.common.Pair; + +/** + * 消息的追踪结果 + * + * 这一条图中的消息可以追踪到多条这样的链路上:→…… + * + * 每条链路有个总体的追踪概率 + */ +public class TracedMessagePath { + // 一条链路:由一系列组成的List + List> path = new ArrayList<>(); +} diff --git a/src/main/java/com/hy/java/uct/util/cd/UMLClass.java b/src/main/java/com/hy/java/uct/util/cd/UMLClass.java index 2963cb099a4bcfa39379dcc4481170b5af8176a4..398456c820c7c2495137e182e93424068eab8a24 100644 --- a/src/main/java/com/hy/java/uct/util/cd/UMLClass.java +++ b/src/main/java/com/hy/java/uct/util/cd/UMLClass.java @@ -66,6 +66,13 @@ public class UMLClass { */ public List mappedJavaFiles = new ArrayList<>(); + /** + * 通过类全称,获得类短称 + */ + public static String getClsShortNameFromFullName(String clsCode_fullName) { + return clsCode_fullName.substring(clsCode_fullName.lastIndexOf(".") + 1); + } + public String getTitle() { return title; } diff --git a/src/main/java/com/hy/java/uct/util/sd/Message.java b/src/main/java/com/hy/java/uct/util/sd/Message.java index 250880e3fea4fd1a939f036d8c5c76d744f09d40..b95b47bf497e7dcb438e20b76b09918cc91e5960 100644 --- a/src/main/java/com/hy/java/uct/util/sd/Message.java +++ b/src/main/java/com/hy/java/uct/util/sd/Message.java @@ -1,27 +1,48 @@ package com.hy.java.uct.util.sd; +import java.util.ArrayList; +import java.util.List; + +import com.hy.java.uct.sdtocode.util.TracedMessagePath; import com.hy.java.uct.umlrecog.util.Line; import com.hy.java.uct.umlrecog.util.PolygonalLine; public class Message { + /** + * 识别消息线 + */ + public Line line; + public int source_pt_index; + public PolygonalLine poly_line; /** * 标志其可能与其他msg重复 */ public boolean is_redu; - + /** + * 消息的说明、源、目标、是否为返回 + * + * 最后两个是配对的消息。正向消息配一条返回消息;返回消息配一条正向消息 + */ + public String msg; public UMLObject source; public UMLObject target; - public Line line; - public int source_pt_index; - public String msg; + public boolean is_return = false; + public Message return_msg = null; + public Message forward_msg = null; + /** + * 消息的追踪结果 + * + * 这一条图中的消息可以追踪到多条这样的链路上:→…… + * + * 每条链路有个总体的追踪概率 + */ + List traced_path_ls = new ArrayList<>(); - public PolygonalLine poly_line; + public Message() { + } public Message(Line line) { this.line = line; } - - public Message() { - } } diff --git a/src/main/java/com/hy/java/uct/util/sd/UMLObject.java b/src/main/java/com/hy/java/uct/util/sd/UMLObject.java index 075db36c51885bb4864a32847bb6df721de34314..5dcdf64bc953ed319266d6d0d8b1a724d5b27466 100644 --- a/src/main/java/com/hy/java/uct/util/sd/UMLObject.java +++ b/src/main/java/com/hy/java/uct/util/sd/UMLObject.java @@ -4,6 +4,7 @@ import java.util.ArrayList; import java.util.List; import com.hy.java.uct.umlrecog.util.Rectangle; +import com.hy.java.uct.util.cd.UMLClass; public class UMLObject { /** @@ -17,6 +18,7 @@ public class UMLObject { private String title; public List out_msgs = new ArrayList<>(); public List in_msgs = new ArrayList<>(); + public List 
mapped_file_dir_ls = new ArrayList<>(); public String getTitle() { return title; diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW-relation.txt deleted file mode 100644 index 92b130e21fe3e51603fef0e4385eb504ae8592af..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW-relation.txt +++ /dev/null @@ -1,111 +0,0 @@ -we&version 0.21&依赖 -we&version 0.21&依赖 -we&hdf&依赖 -modular structure&hdf and version 0.21&AGGREGATION -we&modular structure&依赖 -we&modular structure&依赖 -we&hdf&依赖 -static analysis&source code&AGGREGATION -class and group&class ( module )&AGGREGATION -structure&figure&依赖 -[ module view&] 1.1 Modularity risk&依赖 -[ module view&] 1.1 Modularity risk&依赖 -[ module view&] 1.1 Modularity risk&依赖 -[ module view&] 1.1 Modularity risk&依赖 -[ module view&time&依赖 -it&structure& -[ module view&] 1.1 Modularity risk&依赖 -[ module view&software&依赖 -[ module view&time&依赖 -[ module view&time&依赖 -[ module view&software&依赖 -[ module view&time&依赖 -development&software&AGGREGATION -[ module view&] 1.1 Modularity risk&依赖 -[ module view&software&依赖 -[ module view&software&依赖 -[ module view&] 1.1 Modularity risk&依赖 -[ module view&] 1.1 Modularity risk&依赖 -section&characteristic&依赖 -section&four signal&依赖 -section&code&依赖 -characteristic&code&AGGREGATION -part&package&AGGREGATION -class or group&class&AGGREGATION -its&own& -class or group&more dependency ( incoming or outgoing )&依赖 -they&code&依赖 -module structure&structure&GENERALIZATION -signal&module structure&依赖 -package hdf&two&依赖 -hdfs package&code&依赖 -hdfs package&package&GENERALIZATION -hdfs.common package&package&GENERALIZATION -default port number&server.namenode and server.datanode package&依赖 -NameNode and DataNode&default port number&依赖 -server&package&依赖 -server&package& -hdfs.common instead&namenode or datanode server&依赖 -server&dependency&依赖 -1.1.1.2 hfds.security security.token.delegation.DelegationTokenSecretManager&server.namenode.FSNameSystem&依赖 -security code&code&GENERALIZATION -security code&namenode&依赖 -security code&other server&依赖 -1.1.1.3 hdfs.protocol The class blocklistaslong&server.datanode module&依赖 -1.1.1.3 hdfs.protocol The class blocklistaslong&ReplicaInfo&依赖 -hdfs.protocol&server&依赖 -1.1.1.4 hdfs.server.protocol&server.common&依赖 -protocol&defined constant&依赖 -1.1.1.4 hdfs.server.protocol&two class&依赖 -1.1.1.4 hdfs.server.protocol&two class&依赖 -1.1.1.4 hdfs.server.protocol&server.common&依赖 -they&communication&依赖 -they&communication&依赖 -they&server&依赖 -they&server&依赖 -their&use& -hdfs.server.protocol&protocol message&依赖 -its&messages& -hdfs.server.protocol&code&依赖 -hdfs.server.protocol&class&依赖 -It&protocol&依赖 -It&class&依赖 -It&dependency&依赖 -1.1.1.5 server.common IncorrectVersionException and InconsistentFSStateException&server.protocol&依赖 -function ( jsphelper.sortnodelist )&relevant&依赖 -namenode package&package&GENERALIZATION -function ( jsphelper.sortnodelist )&it&依赖 -JspHelper&namenode&依赖 -it&other server&依赖 -function ( jsphelper.sortnodelist )&namenode package&依赖 -1.1.1.6 hdfs.server.namenode server.namenode&servlet&依赖 -class namenode.FSNameSystem&multiple cyclic dependency&依赖 -It&namenode.NameNode , namenode.FSNameSystemMetrics and namenode.LeaseManager&依赖 -It&direct cyclic 
dependency&依赖 -1.1.1.7 hdfs.server.datanode server.datanode&hdfs.DFSClient&依赖 -1.1.1.8 hdfs.server.balancer server.balancer&hdfs.DFSClient&依赖 -possibility&dependency&依赖 -possibility&dependency&依赖 -balancer&namenode&依赖 -namenode.UnsupportedActionException&namenode and balancer namenode.Namenode&依赖 -it&port number&依赖 -namenode.UnsupportedActionException&it&依赖 -namenode&number& -block placement policy&balancer&AGGREGATION -policy&namenode&AGGREGATION -block placement policy&policy&依赖 -block placement policy&namenode&依赖 -check&protocol message&依赖 -check&server.protcol&依赖 -class server.balancer.Balancer&several cyclic dependency&依赖 -they&same source file&依赖 -dependency structure&class&AGGREGATION -effect&dependency&AGGREGATION -different component&them&依赖 -1.1.1.9 hdfs.tools tool&different component&依赖 -couple&different component&AGGREGATION -1.1.1.9 hdfs.tools tool&couple&依赖 -different component&low coupling&依赖 -main domain&a filesystem ( debugging&AGGREGATION -it&sense&依赖 -user&convenience& diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt deleted file mode 100644 index 4af331b79395fa7e7deaf99d67ab9990ab74f8b4..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt +++ /dev/null @@ -1,75 +0,0 @@ -HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW - - - -1 Module View -In this section we describe the modular structure of the HDFS, version 0.21. - -This structure is derived from a static analysis of the source code, specifically focusing on the dependencies between classes and groups of classes (modules). This structure is shown in the figure below. -[Module view] -1.1 Modularity Risks -With development of the software over time, all source code inevitably drifts away from it’s initial "as-designed" structure. This section identifies four signals--characteristics of the code--that suggest that the source code has evolved to a less modular structure: - -Module with a very weak dependency on another module: If a dependency from one module to another is very weak, it could indicate that the dependency is not well thought-out, but rather a technical debt incurred by a short-term expediency to easily implement a feature or fix a bug rather than adhering strictly to a principled architecture. -Module within a module: If a part of a package only depends on itself, and has no incoming dependencies from other classes in the package, this suggests that it is really a separate module. -Class(es) with more connections to another module than to its own: If a class or group of classes in a module has more dependencies (incoming or outgoing) to classes in another module than its own, this could be a sign that the class or group of classes could perhaps better be located in the other module, or perhaps in a separate, shared module. -Cyclic dependencies: A cyclic dependency occurs when two or more classes depend on each other either directly or indirectly. We consider cyclic dependencies a signal for refactoring opportunities, because they make it harder to understand, reuse and test the code. -When these signals are applied to the module structure, it appears that the HDFS could be made more modular. 
Each of these refactoring opportunities is now discussed, presented per module. - -1.1.1.1 hdfs -Because the hdfs package now contains code that is used by both the client and the server, the package hdfs should be split into two: hdfs.client and hdfs.common. The hdfs.common package can contain all code that is shared by both the client and server modules, while the client would contain just the code necessary for the client. This division could look as follows: - -hdfs.client - -hdfs.common - -BlockMissingException.java -DFSClient.java -DFSInputStream.java -DFSOutputStream.java -ByteRangeInputStream.java -DFSClient.java -HftpFileSystem.java -HsftpFileSystem.java - -BlockReader.java -DeprecatedUTF8.java -DFSConfigKeys.java -DFSUtil.java -DistributedFileSystem.java -HdfsConfiguration.java -HDFSPolicyProvider.java - - - - -Currently, the default port numbers that the NameNode and DataNode run with are stored in the server.namenode and server.datanode packages respectively. If they would be stored in hdfs.common instead, servers that want to communicate with either the namenode or datanode server would not need a dependency on that server's package. - -1.1.1.2 hfds.security -security.token.delegation.DelegationTokenSecretManager depends on server.namenode.FSNameSystem, while the security code is used by other servers than the namenode. This could be refactored so that FSNameSystem is called from the namenode rather than the security module, removing a dependency. - -1.1.1.3 hdfs.protocol -The class BlockListAsLongs depends on ReplicaInfo in the server.datanode module, which looks like an unhealthy dependency, given that hdfs.protocol is used by all servers rather than just the datanode server. Building the block list is a task that is better performed in the hdfs.server.datanode module. - -1.1.1.4 hdfs.server.protocol -The server.protocol package depends on two classes in server.common that protocol just uses for defined constants. It seems that it would be better to store these constants in the server.protocol package, as they (proven by their use in server.protocol) define the communication between servers. There are also dependencies from hdfs.server.protocol to hdfs.server.datanode (in protocol.DataNodeRegistration) and hdfs.server.namenode (in protocol.CheckpointCommand). These dependencies exist because hdfs.server.protocol contains code to fill its protocol messages from these classes. It would remove the dependencies from protocol on these classes if datanode and namenode themselves would be responsible for filling in the protocol messages. - -1.1.1.5 server.common -IncorrectVersionException and InconsistentFSStateException would probably fit better in server.protocol. JspHelper depends on namenode; the function that uses it (JspHelper.sortNodeList) can be moved to the namenode package, since it's not relevant for other servers. - -1.1.1.6 hdfs.server.namenode -server.namenode depends on hdfs.DFSClient to create servlets. It appears that this code could be refactored to be put into hdfs.common. The class namenode.FSNameSystem is involved in multiple cyclic dependencies. It has a direct cyclic dependency with namenode.NameNode, namenode.FSNameSystemMetrics and namenode.LeaseManager, and there are indirect cyclic dependencies on more classes (for example UpgradeObjectNamenode, UpgradeManagerNamenode. - -1.1.1.7 hdfs.server.datanode -server.datanode also depends on hdfs.DFSClient. Putting this code in common would be a good refactoring opportunity. 
- -1.1.1.8 hdfs.server.balancer -server.balancer also depends on hdfs.DFSClient, again input from the community is greatly appreciated on if putting this code in common would be a good refactoring opportunity. Another refactoring possibility for the balancer is to remove the dependency on the namenode. The classes from namenode that the balancer depends on are: - -namenode.UnsupportedActionException, which could be moved to protocol, since it's a shared message between namenode and balancer -namenode.Namenode, on which it only depends to get the namenode's port number, which could be stored in the common package. -namenode.BlockPlacementPolicy, on which it depends to check if the block placement policy of the balancer matches the policy of the namenode. This check could be done through a protocol message in server.protcol as well. -Removing the dependency on namenode would make balancer a fully separate server, and would allow it to perform at the same level as datanode and namenode. The class server.balancer.Balancer contains several cyclic dependencies, however, they are all within classes in the same source file. This means the effect of the dependencies is likely less severe, but refactoring the dependency structure of this class could still be an opportunity to increase modularity. - -1.1.1.9 hdfs.tools -tools consists of a couple of different components that have low coupling between them. But because they all provide functionality that falls somewhat outside the main domain of a filesystem (debugging and administrative tools), it makes sense to keep them together in one package for the user's convenience. \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt.xml.xls deleted file mode 100644 index 81287be724e8c0cc41ffbf463f512afff968fb9a..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM-relation.txt deleted file mode 100644 index 7c96078f1a870c4b04938b20d7c9470617c594a2..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM-relation.txt +++ /dev/null @@ -1,359 +0,0 @@ -platform or framework&big data problem&依赖 -You&suite&依赖 -number&service&AGGREGATION -suite&( ingesting&依赖 -suite&number&依赖 -suite&service&依赖 -You&it&依赖 -Hadoop component&Hadoop ecosystem&依赖 -backbone&Hadoop Ecosystem&AGGREGATION -one&large data set&依赖 -different type&large data set&AGGREGATION -one&i.e. structured , unstructured and semus structured datum )&依赖 -one&different type&依赖 -we&whole hdf&依赖 -hdf&level&依赖 -hdf&resource&依赖 -level&abstraction&AGGREGATION -we&single unit&依赖 -hdf&abstraction&依赖 -we&whole hdf&依赖 -we&single unit&依赖 -It&us&依赖 -our&data& -hdf&two core component&依赖 -hdf&i.e. 
namenode&依赖 -it&actual datum&依赖 -table&content&AGGREGATION -It&metada&依赖 -you&table&依赖 -you&content&依赖 -it&less storage&依赖 -datum&DataNodes&依赖 -datum&other hand&依赖 -it&more storage resource&依赖 -your&data& -datanode&distributed environment&依赖 -datanode&laptops and desktop&依赖 -your&laptops& -datanode&distributed environment&依赖 -You&namenode while&依赖 -it&request&依赖 -it&client&依赖 -YARN&brain&依赖 -YARN&YARN&依赖 -your&Ecosystem& -YARN&Hadoop Ecosystem&依赖 -brain&Hadoop Ecosystem&AGGREGATION -your&activities& -It&processing activity&依赖 -It&two major component&依赖 -It&i.e. resource manager&依赖 -processing department&department&GENERALIZATION -Resource Manager&processing department&依赖 -It&processing request&依赖 -actual processing&place&依赖 -part&request&AGGREGATION -Node manager&Data Node&依赖 -It&execution&依赖 -execution&task&AGGREGATION -It&task&依赖 -It&single Data Node&依赖 -scheduler&scheduling algorithm&依赖 -your&requirements& -while application manager&job submission&依赖 -core component&processing&AGGREGATION -ResourceManager&two component&依赖 -logic&processing&AGGREGATION -MapReduce&other word&依赖 -application&Hadoop environment&依赖 -application&large data set&依赖 -application&distributed and parallel algorithm&依赖 -map ( )&MapReduce program&依赖 -Map function&filter , group and sort&依赖 -Map function&action&依赖 -reduce function aggregate&reduce function aggregate&依赖 -us&above example&依赖 -better understanding&MapReduce program&AGGREGATION -We&student&依赖 -sample case&student&AGGREGATION -their&departments& -We&sample case&依赖 -We&department&依赖 -number&student&AGGREGATION -We&number&依赖 -Map program&student&依赖 -Map program&appearing&依赖 -key value pair&Reduce function&依赖 -total number&student&AGGREGATION -Reduce function&department&依赖 -APACHE PIG PIG&two part&依赖 -APACHE PIG PIG&Pig Latin&依赖 -You&it&依赖 -You&Java and JVM&依赖 -It&pig latin language&依赖 -pig latin language&SQL&依赖 -pig latin language&command structure&依赖 -10 line&pig latin =&AGGREGATION -back end&Pig job&AGGREGATION -200 line&Map-Reduce Java code&AGGREGATION -compiler&MapReduce&依赖 -compiler&pig latin&依赖 -It&sequential set&依赖 -It&MapReduce job&依赖 -sequential set&MapReduce job&AGGREGATION -PIG&Yahoo&依赖 -It&platform&依赖 -It&etl ( extract&依赖 -It&data flow&依赖 -load command&datum&依赖 -load command&PIG&依赖 -we&it&依赖 -we&various function&依赖 -you&screen&依赖 -you&hdf&依赖 -you&result&依赖 -you&datum&依赖 -you&datum&依赖 -APACHE HIVE Facebook&people&依赖 -APACHE HIVE Facebook&HIVE&依赖 -&large data set&依赖 -&SQL-like interface&依赖 -&distributed environment&依赖 -query language&Hive&AGGREGATION -It&Hive Command Line and JDBC/ODBC driver&依赖 -It&2 basic component&依赖 -it&purpose&依赖 -i.e. large datum set processing&i.e. large datum set processing&依赖 -i.e. large datum set processing&i.e. large datum set processing&依赖 -i.e. large datum set processing&purpose&依赖 -i.e. large datum set processing&i.e. large datum set processing&依赖 -i.e. large datum set processing&purpose&依赖 -i.e. 
large datum set processing&purpose&依赖 -primitive data type&SQL&AGGREGATION -It&SQL&依赖 -It&primitive data type&依赖 -your&needs& -You&predefined function&依赖 -Machine learning algorithm&self-learning machine&依赖 -Machine learning algorithm&us&依赖 -it&important future decision&依赖 -descendant artificial intelligence ( ai )&artificial intelligence ( ai )&AGGREGATION -what mahout&what mahout&依赖 -It&collaborative filter , clustering and classification&依赖 -Mahout&function& -us&1&依赖 -us&them&依赖 -their&patterns& -their&characteristics& -It&similar group&依赖 -similar group&datum&AGGREGATION -article&research papers etc&依赖 -It&datum&依赖 -It&datum&依赖 -article&blog&依赖 -object&which&依赖 -Frequent item&mahout check&依赖 -cell phone and cover&example&依赖 -you&cell phone&依赖 -It&library&依赖 -It&predefined set&依赖 -predefined set&different use case&依赖 -predefined set&different inbuilt algorithm&依赖 -predefined set&library&AGGREGATION -APACHE SPARK Apache Spark&distributed computing environment&依赖 -APACHE SPARK Apache Spark&real time data analytic&依赖 -University and Berkeley&California&AGGREGATION -Spark&Scala&依赖 -speed&data processing&AGGREGATION -It&in-memory computation&依赖 -it&Map-Reduce&依赖 -it&high processing power&依赖 -standard library&seamless integration&依赖 -standard library&complex workflow&依赖 -various set&service&AGGREGATION -its&capabilities& -it&capability&依赖 -it&various set&依赖 -it&service&依赖 -it&integrate&依赖 -apache spark best fit&real time processing&依赖 -apache spark best fit&real time processing&依赖 -apache spark best fit&real time processing&依赖 -Spark&ability& -it&best result&依赖 -Hadoop&operation& -their&Data& -it&other word&依赖 -It&datum&依赖 -type&datum&AGGREGATION -It&type&依赖 -It&’s bigtable&依赖 -Google&BigTable& -top&hdf&AGGREGATION -It&sparse datum&依赖 -It&fault tolerant way&依赖 -HBase application&REST , Avro and Thrift api&依赖 -HBase&Java&依赖 -us&example&依赖 -you&customer&依赖 -you&number&依赖 -who&email&依赖 -who&word complaint&依赖 -number&customer&AGGREGATION -You&customer email&依赖 -billion&customer email&AGGREGATION -You&billion&依赖 -their&emails& -we&set&依赖 -small amount&datum&AGGREGATION -we&large datum&依赖 -kind&problem&AGGREGATION -kind&datum&AGGREGATION -It&open source application&依赖 -It&Google Dremel&依赖 -replica&Google Dremel&AGGREGATION -powerful feature&Drill&AGGREGATION -It&different kinds NoSQL databases and file system&依赖 -petabytes and exabyte&data efficiently&AGGREGATION -you&minute )&依赖 -we&say&依赖 -we&petabytes and exabyte&依赖 -we&data efficiently&依赖 -variety&data store&AGGREGATION -main power&Apache Drill&AGGREGATION -ANSI SQL&SQL&GENERALIZATION -Apache Drill&ANSI SQL&依赖 -million&user&AGGREGATION -It&powerful scalability factor&依赖 -their&requests& -combination&various service&AGGREGATION -APACHE ZOOKEEPER Apache Zookeeper&Hadoop job&依赖 -Hadoop job&combination&依赖 -Hadoop job&various service&依赖 -s combation of various service&Hadoop Ecosystem&AGGREGATION -Apache Zookeeper&Zookeeper&GENERALIZATION -coordinator&Hadoop job&AGGREGATION -Apache Zookeeper&various service&依赖 -Apache Zookeeper&distributed environment&依赖 -it&Zookeeper&依赖 -service&common configuration while&依赖 -service&many problem&依赖 -service&interaction&依赖 -configuration&service&AGGREGATION -It&lot&依赖 -It&time&依赖 -lot&time&AGGREGATION -it&simple service&依赖 -APACHE OOZIE&Apache Oozie&依赖 -APACHE OOZIE&Hadoop Ecosystem&依赖 -APACHE OOZIE&clock and alarm service&依赖 -Oozie&Apache job&依赖 -It&Hadoop job&依赖 -two kind&Oozie job&AGGREGATION -sequential set&action&AGGREGATION -You&it&依赖 -You&relay race&依赖 -his&part& -athlete&last one&依赖 -athlete&part&依赖 
-our&body& -Oozie coordinator&same manner&依赖 -Oozie coordinator&availability&依赖 -Oozie coordinator&datum&依赖 -we&external stimulus&依赖 -availability&datum&AGGREGATION -important part&Hadoop Ecosystem&AGGREGATION -our&Ecosystem& -APACHE FLUME Ingesting datum&Hadoop Ecosystem&依赖 -collect , aggregate and move large amount&data set&AGGREGATION -It&solution&依赖 -It&online streaming datum&依赖 -It&us&依赖 -Flume agent&streaming datum&依赖 -Flume agent&various data source&依赖 -architecture&Flume&AGGREGATION -us&architecture&依赖 -Flume agent&hdf&依赖 -us&Flume&依赖 -data source&source&GENERALIZATION -you&data source&依赖 -one&famous source&AGGREGATION -Twitter&streaming datum&依赖 -Twitter&famous source&依赖 -flume agent&source , sink and channel&依赖 -flume agent&3 component&依赖 -it&incoming streamline and store&依赖 -it&datum&依赖 -it&channel&依赖 -Channel&source&依赖 -source&datum&AGGREGATION -Channel&datum&依赖 -our last component i.e. sink&our last component i.e. sink&依赖 -our&component& -apache sqoop&flume and sqoop&依赖 -apache sqoop&major difference&依赖 -Flume&unstructured datum&依赖 -Flume&hdf&依赖 -we&Sqoop command&依赖 -Sqoop&diagram&依赖 -our&task& -sub task&datum&依赖 -sub task&part&依赖 -part&datum&AGGREGATION -sub task&Hadoop Ecosystem&依赖 -Map task&whole datum&依赖 -Export&similar manner&依赖 -chunk&datum&AGGREGATION -our&Job& -Map task&datum&依赖 -it&Map task&依赖 -Map task&chunk&依赖 -Map task&hdf&依赖 -we&Job&依赖 -chunk&structured data destination&依赖 -exported chunk&datum&AGGREGATION -most&case&AGGREGATION -we&whole datum&依赖 -we&destination&依赖 -Apache Lucene&Java&依赖 -It&search and full indexing&依赖 -It&Lucene Java search library&依赖 -It&core&依赖 -It&software&依赖 -It&Apache Hadoop cluster&依赖 -number&host&AGGREGATION -It&Hadoop service&依赖 -It&step process&依赖 -It&step&依赖 -It&number&依赖 -It&configuration&依赖 -It&Hadoop service&依赖 -configuration&Hadoop service&AGGREGATION -It&Hadoop service&依赖 -It&configuration&依赖 -service&user&依赖 -your&attention& -I&attention&依赖 -Hadoop Ecosystem&many big company&依赖 -Hadoop Ecosystem&Facebook , Google , Yahoo , University&依赖 -Hadoop Ecosystem&success&依赖 -Hadoop&capabilities& -its&success& -Hadoop Ecosystem&many big company&依赖 -Hadoop Ecosystem&success&依赖 -Facebook , Google , Yahoo , University berkeley ) etc.&california (&AGGREGATION -berkeley ) etc.&part&依赖 -their&part& -Hadoop Ecosystem&Facebook , Google , Yahoo , University&依赖 -knowledge&Hadoop Ecosystem&依赖 -knowledge&Hadoop Ecosystem&依赖 -knowledge&Hadoop Ecosystem&依赖 -You&set&依赖 -You&Hadoop component&依赖 -Hadoop component&solution&依赖 -set&Hadoop component&AGGREGATION -set&service&AGGREGATION -we&service&依赖 -we&Hadoop Ecosystem&依赖 -we&set&依赖 diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt deleted file mode 100644 index a93754189bc23764f68bc0be9490e4ec30a43ba8..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt +++ /dev/null @@ -1,139 +0,0 @@ -HADOOP ECOSYSTEM -Hadoop Ecosystem is neither a programming language nor a service, it is a platform or framework which solves big data problems. You can consider it as a suite which encompasses a number of services (ingesting, storing, analyzing and maintaining) inside it. Below are the Hadoop components, that together form a Hadoop ecosystem. 
- HDFS -> Hadoop Distributed File System - YARN -> Yet Another Resource Negotiator - MapReduce -> Data processing using programming - Spark -> In-memory Data Processing - PIG, HIVE-> Data Processing Services using Query (SQL-like) - HBase -> NoSQL Database - Mahout, Spark MLlib -> Machine Learning - Apache Drill -> SQL on Hadoop - Zookeeper -> Managing Cluster - Oozie -> Job Scheduling - Flume, Sqoop -> Data Ingesting Services - Solr& Lucene -> Searching & Indexing - Ambari -> Provision, Monitor and Maintain cluster -HDFS - Hadoop Distributed File System is the core component or you can say, the backbone of Hadoop Ecosystem. - HDFS is the one, which makes it possible to store different types of large data sets (i.e. structured, unstructured and semi structured data). - HDFS creates a level of abstraction over the resources, from where we can see the whole HDFS as a single unit. - It helps us in storing our data across various nodes and maintaining the log file about the stored data (metadata). - HDFS has two core components, i.e. NameNode and DataNode. -1. The NameNode is the main node and it doesn’t store the actual data. It contains metadata, just like a log file or you can say as a table of content. Therefore, it requires less storage and high computational resources. -2. On the other hand, all your data is stored on the DataNodes and hence it requires more storage resources. These DataNodes are commodity hardware (like your laptops and desktops) in the distributed environment. That’s the reason, why Hadoop solutions are very cost effective. -3. You always communicate to the NameNode while writing the data. Then, it internally sends a request to the client to store and replicate data on various DataNodes. -YARN -Consider YARN as the brain of your Hadoop Ecosystem. It performs all your processing activities by allocating resources and scheduling tasks. - It has two major components, i.e. Resource Manager and Node Manager. -1. Resource Manager is again a main node in the processing department. -2. It receives the processing requests, and then passes the parts of requests to corresponding Node Managers accordingly, where the actual processing takes place. -3. Node Managers are installed on every Data Node. It is responsible for execution of task on every single Data Node. - -1. Schedulers: Based on your application resource requirements, Schedulers perform scheduling algorithms and allocates the resources. -2. Applications Manager: While Applications Manager accepts the job submission, negotiates to containers (i.e. the Data node environment where process executes) for executing the application specific Application Master and monitoring the progress. ApplicationMasters are the deamons which reside on DataNode and communicates to containers for execution of tasks on each DataNode. -3. ResourceManager has two components: Schedulers and application manager -MAPREDUCE -It is the core component of processing in a Hadoop Ecosystem as it provides the logic of processing. In other words, MapReduce is a software framework which helps in writing applications that processes large data sets using distributed and parallel algorithms inside Hadoop environment. - In a MapReduce program, Map() and Reduce() are two functions. -1. The Map function performs actions like filtering, grouping and sorting. -2. While Reduce function aggregates and summarizes the result produced by map function. -3. 
The result generated by the Map function is a key value pair (K, V) which acts as the input for Reduce function. -Let us take the above example to have a better understanding of a MapReduce program. We have a sample case of students and their respective departments. We want to calculate the number of students in each department. Initially, Map program will execute and calculate the students appearing in each department, producing the key value pair as mentioned above. This key value pair is the input to the Reduce function. The Reduce function will then aggregate each department and calculate the total number of students in each department and produce the given result. -APACHE PIG - PIG has two parts: Pig Latin, the language and the pig runtime, for the execution environment. You can better understand it as Java and JVM. - It supports pig latin language, which has SQL like command structure. -10 line of pig latin = approx. 200 lines of Map-Reduce Java code But don’t be shocked when I say that at the back end of Pig job, a map-reduce job executes. - The compiler internally converts pig latin to MapReduce. It produces a sequential set of MapReduce jobs, and that’s an abstraction (which works like black box). - PIG was initially developed by Yahoo. - It gives you a platform for building data flow for ETL (Extract, Transform and Load), processing and analyzing huge data sets. -How Pig works? In PIG, first the load command, loads the data. Then we perform various functions on it like grouping, filtering, joining, sorting, etc. At last, either you can dump the data on the screen or you can store the result back in HDFS. -APACHE HIVE - Facebook created HIVE for people who are fluent with SQL. Thus, HIVE makes them feel at home while working in a Hadoop Ecosystem. - Basically, HIVE is a data warehousing component which performs reading, writing and managing large data sets in a distributed environment using SQL-like interface. -HIVE + SQL = HQL - The query language of Hive is called Hive Query Language(HQL), which is very similar like SQL. - It has 2 basic components: Hive Command Line and JDBC/ODBC driver. - The Hive Command line interface is used to execute HQL commands. - While, Java Database Connectivity (JDBC) and Object Database Connectivity (ODBC) is used to establish connection from data storage. - Secondly, Hive is highly scalable. As, it can serve both the purposes, i.e. large data set processing (i.e. Batch query processing) and real time processing (i.e. Interactive query processing). - It supports all primitive data types of SQL. - You can use predefined functions, or write tailored user defined functions (UDF) also to accomplish your specific needs. -APACHE MAHOUT -Now, let us talk about Mahout which is renowned for machine learning. Mahout provides an environment for creating machine learning applications which are scalable. Machine learning algorithms allow us to build self-learning machines that evolve by itself without being explicitly programmed. Based on user behaviour, data patterns and past experiences it makes important future decisions. You can call it a descendant of Artificial Intelligence (AI). What Mahout does? It performs collaborative filtering, clustering and classification. Some people also consider frequent item set missing as Mahout’s function. Let us understand them individually: -1. Collaborative filtering: Mahout mines user behaviors, their patterns and their characteristics and based on that it predicts and make recommendations to the users. 
The typical use case is an e-commerce website. -2. Clustering: It organizes similar groups of data together; for example, articles can include blogs, news, research papers, etc. -3. Classification: It means classifying and categorizing data into various sub-departments; for example, articles can be categorized into blogs, news, essays, research papers and other categories. -4. Frequent itemset mining: Here Mahout checks which objects are likely to appear together and makes suggestions if one of them is missing. For example, a cell phone and a cover are generally bought together. So, if you search for a cell phone, it will also recommend you the cover and cases. -Mahout provides a command line to invoke various algorithms. It has a predefined library set which already contains different inbuilt algorithms for different use cases. -APACHE SPARK - Apache Spark is a framework for real-time data analytics in a distributed computing environment. - Spark is written in Scala and was originally developed at the University of California, Berkeley. - It executes in-memory computations to increase the speed of data processing over MapReduce. - It is 100x faster than Hadoop for large-scale data processing by exploiting in-memory computations and other optimizations. Therefore, it requires higher processing power than MapReduce. -As you can see, Spark comes packed with high-level libraries, including support for R, SQL, Python, Scala, Java, etc. These standard libraries enable seamless integration in complex workflows. On top of this, it also allows various sets of services to integrate with it, like MLlib, GraphX, SQL + DataFrames, streaming services, etc., to increase its capabilities. Apache Spark is the best fit for real-time processing, whereas Hadoop was designed to store unstructured data and execute batch processing over it. When we combine Apache Spark’s abilities, i.e. high processing speed, advanced analytics and multiple integration support, with Hadoop’s low-cost operation on commodity hardware, it gives the best results. -That is the reason why Spark and Hadoop are used together by many companies for processing and analyzing their Big Data stored in HDFS. -APACHE HBASE - HBase is an open-source, non-relational, distributed database. In other words, it is a NoSQL database. - It supports all types of data and that is why it’s capable of handling anything and everything inside a Hadoop ecosystem. - It is modelled after Google’s BigTable, which is a distributed storage system designed to cope with large data sets. - HBase was designed to run on top of HDFS and provides BigTable-like capabilities. - It gives us a fault-tolerant way of storing sparse data, which is common in most Big Data use cases. - HBase is written in Java, whereas HBase applications can be written using the REST, Avro and Thrift APIs. -For better understanding, let us take an example. You have billions of customer emails and you need to find out the number of customers who have used the word 'complaint' in their emails. The request needs to be processed quickly (i.e. in real time). So, here we are handling a large data set while retrieving a small amount of data. HBase was designed for solving these kinds of problems. -APACHE DRILL -Apache Drill is used to drill into any kind of data. It’s an open-source application which works in a distributed environment to analyze large data sets. - It is a replica of Google Dremel. - It supports different kinds of NoSQL databases and file systems, which is a powerful feature of Drill. 
For example: Azure Blob Storage, Google Cloud Storage, HBase, MongoDB, MapR-DB, HDFS, MapR-FS, Amazon S3, Swift, NAS and local files. -So, basically, the main aim behind Apache Drill is to provide scalability so that we can process petabytes and exabytes of data efficiently (or, you can say, in minutes). - The main power of Apache Drill lies in combining a variety of data stores just by using a single query. - Apache Drill basically follows ANSI SQL. - It has a powerful scalability factor in supporting millions of users and serving their query requests over large-scale data. -APACHE ZOOKEEPER - Apache Zookeeper is the coordinator of any Hadoop job, which involves a combination of various services in the Hadoop Ecosystem. - Apache Zookeeper coordinates with various services in a distributed environment. -Before Zookeeper, it was very difficult and time-consuming to coordinate between different services in the Hadoop Ecosystem. The services earlier had many problems with interactions, like sharing common configuration while synchronizing data. Even if the services are configured, changes in the configurations of the services make it complex and difficult to handle. Grouping and naming were also time-consuming. Due to the above problems, Zookeeper was introduced. It saves a lot of time by performing synchronization, configuration maintenance, grouping and naming. -Although it’s a simple service, it can be used to build powerful solutions. -APACHE OOZIE -Consider Apache Oozie as a clock and alarm service inside the Hadoop Ecosystem. For Apache Hadoop jobs, Oozie works just like a scheduler. It schedules Hadoop jobs and binds them together as one logical unit of work. There are two kinds of Oozie jobs: -1. Oozie workflow: This is a sequential set of actions to be executed. Think of it as a relay race, where each athlete waits for the previous one to complete their part. -2. Oozie Coordinator: These are the Oozie jobs which are triggered when data is made available to them. Think of this as the stimulus-response system in our body. In the same manner as we respond to an external stimulus, an Oozie coordinator responds to the availability of data and rests otherwise. -APACHE FLUME -Ingesting data is an important part of our Hadoop Ecosystem. - Flume is a service which helps in ingesting unstructured and semi-structured data into HDFS. - It gives us a solution which is reliable and distributed, and it helps us in collecting, aggregating and moving large amounts of data. - It helps us to ingest online streaming data from various sources like network traffic, social media, email messages, log files, etc. into HDFS. -Now, let us understand the architecture of Flume: -There is a Flume agent which ingests the streaming data from various data sources (for example, a web server) into HDFS. Twitter is among the famous sources of streaming data. The Flume agent has three components: source, channel and sink. -1. Source: It accepts the data from the incoming stream and stores the data in the channel. -2. Channel: It acts as the local or primary storage. A channel is temporary storage between the source of the data and the persistent data in HDFS. -3. Sink: Our last component, the sink, collects the data from the channel and commits or writes the data to HDFS permanently. 
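To make the ingestion target concrete, the short sketch below (an editorial illustration, not part of the original article or of Flume itself) shows how a Java client could write a handful of ingested records into HDFS using the Hadoop FileSystem API; the NameNode URI and the paths are illustrative assumptions.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsIngestSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumed NameNode address; in a real cluster this normally comes from core-site.xml.
        conf.set("fs.defaultFS", "hdfs://namenode:9000");
        try (FileSystem fs = FileSystem.get(conf);
             FSDataOutputStream out = fs.create(new Path("/ingest/events/events.log"))) {
            // Stand-in for records delivered by an ingestion pipeline such as Flume.
            String[] records = { "user1,login", "user2,click", "user1,logout" };
            for (String record : records) {
                out.write((record + "\n").getBytes(StandardCharsets.UTF_8));
            }
        } // Closing the stream flushes the remaining data to the DataNodes and finalizes the file.
    }
}

Flume's HDFS sink plays essentially this role, with batching, file rolling and failure handling layered on top.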
-APACHE SQOOP -The major difference between Flume and Sqoop is that: - Flume only ingests unstructured or semi-structured data into HDFS. - Sqoop, on the other hand, can import as well as export structured data from an RDBMS or enterprise data warehouse to HDFS, and vice versa. -Let us understand how Sqoop works: -When we submit a Sqoop command, our main task gets divided into subtasks, which are handled by individual Map Tasks internally. A Map Task is the subtask which imports part of the data into the Hadoop Ecosystem. Collectively, all Map Tasks import the whole data. -Export also works in a similar manner. -When we submit our job, it is mapped into Map Tasks, which bring chunks of data from HDFS. These chunks are exported to a structured data destination. Combining all these exported chunks of data, we receive the whole data at the destination, which in most cases is an RDBMS (MySQL/Oracle/SQL Server). -APACHE SOLR & LUCENE -Apache Solr and Apache Lucene are the two services which are used for searching and indexing in the Hadoop Ecosystem. - Apache Lucene is a Java-based search library, which also helps with spell checking. - If Apache Lucene is the engine, Apache Solr is the car built around it. Solr is a complete application built around Lucene. - It uses the Lucene Java search library as a core for search and full indexing. -APACHE AMBARI -Ambari is an Apache Software Foundation project which aims at making the Hadoop ecosystem more manageable. -It includes software for provisioning, managing and monitoring Apache Hadoop clusters. Ambari provides: -1. Hadoop cluster provisioning: - It gives us a step-by-step process for installing Hadoop services across a number of hosts. - It also handles the configuration of Hadoop services over the cluster. -2. Hadoop cluster management: - It provides a central management service for starting, stopping and re-configuring Hadoop services across the cluster. -3. Hadoop cluster monitoring: - For monitoring health and status, Ambari provides us with a dashboard. - The Ambari Alert framework is an alerting service which notifies the user whenever attention is needed, for example, if a node goes down or a node runs low on disk space. -Finally, I would like to draw your attention to three important points: -1. The Hadoop Ecosystem owes its success to the whole developer community; many big companies like Facebook, Google, Yahoo, the University of California (Berkeley), etc. have contributed their part to increase Hadoop’s capabilities. -2. Inside a Hadoop Ecosystem, knowledge about one or two tools (Hadoop components) would not help in building a solution. You need to learn a set of Hadoop components which work together to build a solution. -3. Based on the use cases, we can choose a set of services from the Hadoop Ecosystem and create a tailored solution for an organization. 
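As a concrete illustration of the MapReduce flow described above (counting students per department), here is a minimal Java sketch using the standard Hadoop MapReduce API. It is an editorial addition, not part of the original article; the "studentName,department" input format, the field positions and the command-line paths are assumptions made only for the example.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DeptStudentCount {
    // Map: emit (department, 1) for every "studentName,department" input line.
    public static class DeptMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text dept = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            if (fields.length == 2) {
                dept.set(fields[1].trim());
                context.write(dept, ONE);
            }
        }
    }

    // Reduce: sum the 1s emitted for each department key.
    public static class DeptReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable v : values) {
                total += v.get();
            }
            context.write(key, new IntWritable(total));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "students per department");
        job.setJarByClass(DeptStudentCount.class);
        job.setMapperClass(DeptMapper.class);
        job.setCombinerClass(DeptReducer.class); // summing is associative, so the reducer doubles as a combiner
        job.setReducerClass(DeptReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Each line of the job's output is then a (department, count) pair, matching the key-value description given in the MapReduce section above.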
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt.xml.xls deleted file mode 100644 index 3914f2d5b9626018aa7d1e2e1c9c5e9af24a4580..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture Guide-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture Guide-relation.txt deleted file mode 100644 index a531f53f956508fe7515e892a863d0283b38e1db..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture Guide-relation.txt +++ /dev/null @@ -1,548 +0,0 @@ -The First Baby&File System Metadata The Communication Protocols Robustness Data Disk Failure&依赖 -Replica Selection Safemode The Persistence&File System Metadata The Communication Protocols Robustness Data Disk Failure&AGGREGATION -The First Baby&Replica Selection Safemode The Persistence&依赖 -It&many similarity&依赖 -It&distributed file system&依赖 -application&large data set&依赖 -hdf&few POSIX requirement&依赖 -hdf&infrastructure&依赖 -hdf&Apache Nutch web search engine project&依赖 -assumption and goal hardware failure hardware failure&exception&依赖 -HDFS instance&server machine&依赖 -hundreds or thousand&server machine&AGGREGATION -each store part&each store part&依赖 -system&data& -each store part&’s datum&依赖 -each store part&’s datum&AGGREGATION -HDFS instance&hundreds or thousand&依赖 -huge number&component&AGGREGATION -non-trivial probability&failure&AGGREGATION -component&failure&依赖 -component&non-trivial probability&依赖 -component&hdf&AGGREGATION -detection&hdf&依赖 -core architectural goal&hdf&AGGREGATION -detection&hdf&依赖 -detection&fault&AGGREGATION -their&data& -hdf&batch processing&依赖 -emphasis&low latency&依赖 -high throughput&data access&AGGREGATION -emphasis&data access&依赖 -low latency&data access&AGGREGATION -emphasis&data access&依赖 -POSIX&many hard requirement&依赖 -Large Data Sets application&large data set&依赖 -typical file&size&依赖 -typical file&size&依赖 -typical file&size&依赖 -typical file&terabyte&依赖 -typical file&terabyte&依赖 -typical file&size&依赖 -hundred&node&AGGREGATION -It&million&依赖 -It&ten&依赖 -It&file&依赖 -It&million&依赖 -It&file&依赖 -million&file&AGGREGATION -It&ten&依赖 -ten&million&AGGREGATION -Simple Coherency Model HDFS application&write-once-read-many access model&依赖 -Simple Coherency Model HDFS application&file&依赖 -assumption&data coherency issue&实现 -MapReduce application&application&GENERALIZATION -MapReduce application&model&依赖 -it&datum&依赖 -size&data set&AGGREGATION -network congestion&overall throughput&依赖 -network congestion&system&依赖 -overall throughput&system&AGGREGATION -platform&choice&AGGREGATION -widespread adoption&hdf&AGGREGATION -large set&application&AGGREGATION -master/slave architecture&architecture&GENERALIZATION -namenode and datanodes hdfs&master/slave architecture&依赖 -HDFS cluster&master server&依赖 -HDFS cluster&cluster&GENERALIZATION -HDFS cluster&single NameNode&依赖 -master server&file system namespace&依赖 -number&addition&依赖 -number&addition&依赖 -number&addition&依赖 -number&addition&依赖 -number&DataNodes&AGGREGATION -number&addition&依赖 -cluster&storage&依赖 -hdf&a file system namespace&依赖 -set&DataNodes&AGGREGATION -file&one or more block&依赖 -block&set&依赖 -block&DataNodes&依赖 -NameNode&file system namespace operation&依赖 -It&DataNodes&依赖 -mapping&block&AGGREGATION 
-It&mapping&依赖 -It&block&依赖 -system&clients& -DataNodes&block creation&依赖 -DataNodes&instruction&依赖 -DataNodes&NameNode&依赖 -hdf architecture namenode and datanode&software&依赖 -piece&software&AGGREGATION -machine&GNU/Linux operating system&依赖 -machine&Java&依赖 -DataNode software&software&GENERALIZATION -machine&NameNode&依赖 -Usage&portable Java language&AGGREGATION -wide range&machine&AGGREGATION -dedicated machine&machine&GENERALIZATION -NameNode software&software&GENERALIZATION -dedicated machine&NameNode software&依赖 -typical deployment&dedicated machine&依赖 -one instance&DataNode software&AGGREGATION -existence&single NameNode&AGGREGATION -existence&architecture&实现 -existence&system&实现 -architecture&system&AGGREGATION -NameNode&HDFS metada&依赖 -system&flows&依赖 -system&such a way&依赖 -user datum&NameNode&依赖 -File System Namespace hdf&traditional hierarchical file organization&依赖 -user&directory&依赖 -user&directory and store file&依赖 -one&file&依赖 -file system namespace hierarchy&most other existing file system&依赖 -hdf&user quota&实现 -hdf&hard link&依赖 -HDFS architecture&feature&实现 -HDFS architecture&architecture&GENERALIZATION -NameNode&file system namespace&依赖 -change&NameNode&依赖 -its&properties& -number&replica&AGGREGATION -application&number&依赖 -application&file&依赖 -replica&file&AGGREGATION -application&replica&依赖 -number&file&AGGREGATION -copy&file&AGGREGATION -number©&AGGREGATION -replication factor&file&AGGREGATION -information&NameNode&依赖 -It&file&依赖 -It&sequence&依赖 -sequence&block&AGGREGATION -It&block&依赖 -block&fault tolerance&依赖 -block&file&AGGREGATION -block size and replication factor&file&依赖 -replication&block&AGGREGATION -NameNode&replication&依赖 -NameNode&block&依赖 -NameNode&decision&依赖 -Receipt&Heartbeat&AGGREGATION -list&block&AGGREGATION -Blockreport&list&依赖 -Blockreport&DataNode&依赖 -Blockreport&block&依赖 -placement&replica&AGGREGATION -replica placement&hdf&依赖 -replica placement&most other distributed file system&依赖 -lot&tuning and experience&AGGREGATION -feature&lot&依赖 -feature&tuning and experience&依赖 -purpose&rack-aware replica placement policy&AGGREGATION -purpose&data reliability&依赖 -current implementation&direction&依赖 -current implementation&direction&依赖 -short-term goal&it&依赖 -its&behavior& -Large HDFS instance&cluster&依赖 -Large HDFS instance&computer&依赖 -cluster&computer&AGGREGATION -NameNode&rack id&依赖 -simple but non-optimal policy&replica&依赖 -entire rack&bandwidth&依赖 -entire rack&use&依赖 -entire rack&multiple rack&依赖 -use&bandwidth&AGGREGATION -policy&replica&依赖 -policy&cluster&依赖 -write&block&依赖 -policy&cost&依赖 -HDFS&policy& -’s placement policy&one replica&依赖 -’s placement policy&one node&依赖 -inter-rack write traffic&write performance&依赖 -policy&inter-rack write traffic&依赖 -chance&rack failure&AGGREGATION -policy&impact datum reliability and availability guarantee&依赖 -it&aggregate network bandwidth&依赖 -datum&three&依赖 -datum&two unique rack&依赖 -replica&rack&依赖 -other third&rack&依赖 -two third&replica&AGGREGATION -One third&replica&AGGREGATION -policy&performance&依赖 -current , default replica placement policy&progress&依赖 -current , default replica placement policy&progress&依赖 -hdf&replica&依赖 -Replica Selection&global bandwidth consumption&依赖 -hdf&read request&依赖 -replica&remote replica&依赖 -NameNode&special state&依赖 -NameNode&special state&依赖 -Replication&data block&AGGREGATION -NameNode&Heartbeat and Blockreport message&依赖 -NameNode&DataNodes&依赖 -Blockreport&data block&依赖 -list&data block&AGGREGATION -Blockreport&hosting&依赖 -block&replica&依赖 -block&specified minimum 
number&依赖 -specified minimum number&replica&AGGREGATION -data block&block&GENERALIZATION -replica&data block&AGGREGATION -minimum number&replica&AGGREGATION -namenode exit&namenode (&依赖 -namenode exit&Safemode state&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&Safemode state&依赖 -namenode exit&namenode (&依赖 -namenode exit&Safemode state&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&namenode (&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&namenode (&依赖 -namenode exit&Safemode state&依赖 -namenode exit&Safemode state&依赖 -namenode exit&Safemode state&依赖 -namenode exit&Safemode state&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&Safemode state&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&namenode (&依赖 -namenode exit&Safemode state&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&Safemode state&依赖 -It&list&依赖 -It&data block&依赖 -It&)&依赖 -specified number&replica&AGGREGATION -NameNode&block&依赖 -NameNode&other datanode&依赖 -HDFS namespace&NameNode&依赖 -Persistence&File System Metadata&AGGREGATION -NameNode&transaction log&依赖 -NameNode&EditLog&依赖 -NameNode&system metada&依赖 -NameNode&file&依赖 -NameNode&local host OS file system&依赖 -its&system& -entire file system namespace&file&依赖 -NameNode&system& -FsImage&’s local file system&依赖 -FsImage&file&依赖 -NameNode&memory&依赖 -NameNode&entire file system namespace and file blockmap&依赖 -image&entire file system namespace and file blockmap&AGGREGATION -4 GB&RAM&AGGREGATION -huge number&files and directory&AGGREGATION -in-memory representation&FsImage&AGGREGATION -it&FsImage and EditLog&依赖 -it&disk&依赖 -It&old EditLog&依赖 -transaction&persistent FsImage&依赖 -its&transactions& -checkpoint¤t implementation&依赖 -Work&periodic checkpointing&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -DataNode&knowledge&依赖 -DataNode&HDFS file&依赖 -It&HDFS datum&依赖 -It&block&依赖 -It&HDFS datum&依赖 -block&HDFS datum&AGGREGATION -It&block&依赖 -It&block&依赖 -It&HDFS datum&依赖 -DataNode&file&依赖 -DataNode&same directory&依赖 -optimal number&file&AGGREGATION -it&heuristic&依赖 -It&local file&依赖 -local file system&single directory&依赖 
-local file system&huge number&依赖 -huge number&file&AGGREGATION -It&same directory&依赖 -list&HDFS data block&AGGREGATION -it&local file system&依赖 -Communication Protocols All HDFS communication protocol&TCP/IP protocol&依赖 -Communication Protocols All HDFS communication protocol&top&依赖 -top&TCP/IP protocol&AGGREGATION -client&configurable TCP port&依赖 -client&NameNode machine&依赖 -NameNode machine&machine&GENERALIZATION -client&connection&依赖 -It&NameNode&依赖 -It&ClientProtocol&依赖 -DataNodes&DataNode Protocol&依赖 -DataNodes&NameNode&依赖 -( rpc ) abstraction&Client Protocol&依赖 -NameNode&rpc&依赖 -NameNode&design&依赖 -it&RPC request&依赖 -robustness primary objective&hdf&AGGREGATION -robustness primary objective&datum&依赖 -presence&failure&AGGREGATION -three common type&failure&AGGREGATION -Data Disk Failure&NameNode&依赖 -Data Disk Failure&Heartbeat message&依赖 -network partition&subset&依赖 -network partition&DataNodes&依赖 -subset&DataNodes&AGGREGATION -NameNode&condition&依赖 -NameNode&absence&依赖 -NameNode&Heartbeat message&依赖 -absence&Heartbeat message&AGGREGATION -NameNode mark&recent heartbeat&依赖 -datum&hdf&依赖 -DataNode death&block&依赖 -DataNode death&replication factor&依赖 -replication factor&block&AGGREGATION -their&value& -HDFS architecture&data rebalancing scheme&依赖 -scheme&one DataNode&依赖 -scheme&datum&依赖 -free space&certain threshold&依赖 -scheme&one DataNode to ###&依赖 -free space&certain threshold&依赖 -scheme&additional replica&依赖 -scheme&additional replica&依赖 -scheme&particular file&依赖 -event&sudden high demand&AGGREGATION -scheme&sudden high demand&依赖 -type&data rebalancing scheme&AGGREGATION -block&datum&AGGREGATION -corruption&fault&依赖 -corruption&storage device&依赖 -checksum checking&HDFS file&依赖 -contents&HDFS file&AGGREGATION -checksum checking&contents&依赖 -checksum checking&HDFS file&依赖 -checksum checking&contents&依赖 -checksum checking&HDFS file&依赖 -checksum checking&contents&依赖 -HDFS file&file&GENERALIZATION -it&separate hidden file&依赖 -it&checksum&依赖 -it&block&依赖 -block&file and store&AGGREGATION -checksum&block&AGGREGATION -client&HDFS file&依赖 -it&file and store&依赖 -file contents&contents&GENERALIZATION -it&checksum&依赖 -client&file contents&依赖 -replica&block&AGGREGATION -DataNode&replica&依赖 -DataNode&block&依赖 -central data structure&hdf&AGGREGATION -Metadata Disk Failure The FsImage&hdf&依赖 -corruption&file&AGGREGATION -corruption&HDFS instance&依赖 -multiple copy&FsImage and EditLog&AGGREGATION -update&updated synchronously&依赖 -synchronous update&rate&依赖 -rate&namespace transaction&AGGREGATION -synchronous update&second&依赖 -synchronous update&namespace transaction&依赖 -synchronous update&multiple copy&AGGREGATION -it&latest consistent fsimage&依赖 -it&use&依赖 -NameNode machine&HDFS cluster&依赖 -NameNode machine&failure&依赖 -single point&failure&AGGREGATION -automatic restart and failover&NameNode software&AGGREGATION -particular instant&time&AGGREGATION -copy&datum&AGGREGATION -snapshot snapshot©&依赖 -snapshot snapshot&support&依赖 -snapshot snapshot&datum&依赖 -One usage&corrupted HDFS instance&依赖 -One usage&snapshot feature&AGGREGATION -hdf&snapshot&依赖 -application&datum&依赖 -they&one or more time&依赖 -hdf&write-once-read-many semantics&依赖 -hdf&file&依赖 -chunk&different DataNode&依赖 -HDFS client&file datum&依赖 -HDFS client&temporary local file&依赖 -HDFS client&client&GENERALIZATION -HDFS client&fact&依赖 -Application write&temporary local file&依赖 -client contact&NameNode&依赖 -local file&datum worth&依赖 -local file&one HDFS block size&依赖 -client contact&NameNode&依赖 -client contact&NameNode&依赖 -namenode insert&file 
name&依赖 -namenode insert&file system hierarchy&依赖 -namenode insert&file name&依赖 -namenode insert&file system hierarchy&依赖 -NameNode&DataNode&依赖 -NameNode&identity&依赖 -identity&DataNode&AGGREGATION -NameNode&client request&依赖 -client&datum&依赖 -client&block&依赖 -client&datum&依赖 -client&datum&依赖 -client&block&依赖 -client&block&依赖 -un-flushed datum&DataNode&依赖 -client&NameNode&依赖 -NameNode&persistent store&依赖 -NameNode&point&依赖 -NameNode&file creation operation&依赖 -careful consideration&target application&AGGREGATION -above approach&target application&依赖 -above approach&careful consideration&依赖 -application&streaming write&依赖 -application&file&依赖 -network speed&writes&依赖 -client&client side buffering&依赖 -client&remote file&依赖 -network speed&network impact throughput&依赖 -Earlier distributed file system&client side caching&依赖 -Earlier distributed file system&client side caching&依赖 -higher performance&data upload&AGGREGATION -POSIX requirement&data upload&依赖 -POSIX requirement&higher performance&依赖 -client&datum&依赖 -client&HDFS file&依赖 -its&data& -datum&local file&依赖 -HDFS file&replication factor&依赖 -replication factor&three&AGGREGATION -HDFS file&three&依赖 -local file&user datum&依赖 -full block&user datum&AGGREGATION -local file&full block&依赖 -client&list&依赖 -list&DataNodes&AGGREGATION -client&NameNode&依赖 -DataNodes&replica&依赖 -DataNodes&block&依赖 -list&DataNodes&依赖 -client&first DataNode&依赖 -client&data block&依赖 -its&repository& -first DataNode&datum&依赖 -turn start&portion&依赖 -portion&data block&AGGREGATION -second DataNode&portion&依赖 -turn start&data block&依赖 -second DataNode&portion&依赖 -third DataNode&datum&依赖 -third DataNode&local repository&依赖 -DataNode&previous one&依赖 -DataNode&datum&依赖 -DataNode&pipeline&依赖 -datum&one DataNode&依赖 -datum&next&依赖 -Accessibility hdf&many different way&依赖 -Accessibility hdf&application&依赖 -file&HDFS instance&AGGREGATION -FS Shell HDFS&user datum&依赖 -form&files and directory&AGGREGATION -FS shell&user interact&依赖 -FS shell&datum&依赖 -syntax&command set&AGGREGATION -Action Command&directory&依赖 -txt FS shell&application&依赖 -cat / foodir/myfile&language&依赖 -Action Command&/ foodir bin/hadoop dfs&依赖 -contents&file&AGGREGATION -Action Command&Safemode bin/hadoop dfsadmin&依赖 -list&DataNodes bin/hadoop dfsadmin&AGGREGATION -refreshnodes browser interface a typical hdf&web server&依赖 -safemode&Generate&依赖 -list&refreshnodes browser interface a typical hdf&依赖 -Action Command&cluster&依赖 -its&files& -file&user&依赖 -it&hdf&依赖 -hdf first&/ trash directory&依赖 -hdf first&it&依赖 -hdf first&file&依赖 -it&/ trash&依赖 -file&configurable amount&依赖 -configurable amount&time&AGGREGATION -file&/ trash&依赖 -file&time&依赖 -expiry&life&AGGREGATION -NameNode&/ trash&依赖 -NameNode&file&依赖 -NameNode&HDFS namespace&依赖 -NameNode&file&依赖 -its&life& -deletion&block&依赖 -deletion&file&AGGREGATION -time&corresponding increase&AGGREGATION -user&file&依赖 -it&/ trash directory&依赖 -user&file&依赖 -he/she&that&依赖 -he/she&/ trash directory&依赖 -/ trash directory&file&依赖 -/ trash directory&latest copy&依赖 -latest copy&file&AGGREGATION -hdf&directory&依赖 -/ trash directory&one special feature&依赖 -hdf&policy&依赖 -hdf&file&依赖 -current default policy&/ trash&依赖 -current default policy&file&依赖 -policy&future&依赖 -policy&defined interface&依赖 -NameNode&excess replica&依赖 -next heartbeat transfer&information&依赖 -corresponding free space&cluster&依赖 -DataNode&corresponding block&依赖 -completion&setReplication API call&AGGREGATION -appearance&free space&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS 
Architecture Guide.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt deleted file mode 100644 index 811d08e8ccf6949edc37095172838b4580ae058d..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt +++ /dev/null @@ -1,172 +0,0 @@ -HDFS Architecture Guide -Introduction -Assumptions and Goals -Hardware Failure -Streaming Data Access -Large Data Sets -Simple Coherency Model -“Moving Computation is Cheaper than Moving Data” -Portability Across Heterogeneous Hardware and Software Platforms -NameNode and DataNodes -The File System Namespace -Data Replication -Replica Placement: The First Baby Steps -Replica Selection -Safemode -The Persistence of File System Metadata -The Communication Protocols -Robustness -Data Disk Failure, Heartbeats and Re-Replication -Cluster Rebalancing -Data Integrity -Metadata Disk Failure -Snapshots -Data Organization -Data Blocks -Staging -Replication Pipelining -Accessibility -FS Shell -DFSAdmin -Browser Interface -Space Reclamation -File Deletes and Undeletes -Decrease Replication Factor -References -Introduction -The Hadoop Distributed File System (HDFS) is a distributed file system designed to run on commodity hardware. It has many similarities with existing distributed file systems. However, the differences from other distributed file systems are significant. HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware. HDFS provides high throughput access to application data and is suitable for applications that have large data sets. HDFS relaxes a few POSIX requirements to enable streaming access to file system data. HDFS was originally built as infrastructure for the Apache Nutch web search engine project. HDFS is now an Apache Hadoop subproject. The project URL is https://hadoop.apache.org/hdfs/. - -Assumptions and Goals -Hardware Failure -Hardware failure is the norm rather than the exception. An HDFS instance may consist of hundreds or thousands of server machines, each storing part of the file system’s data. The fact that there are a huge number of components and that each component has a non-trivial probability of failure means that some component of HDFS is always non-functional. Therefore, detection of faults and quick, automatic recovery from them is a core architectural goal of HDFS. - -Streaming Data Access -Applications that run on HDFS need streaming access to their data sets. They are not general purpose applications that typically run on general purpose file systems. HDFS is designed more for batch processing rather than interactive use by users. The emphasis is on high throughput of data access rather than low latency of data access. POSIX imposes many hard requirements that are not needed for applications that are targeted for HDFS. POSIX semantics in a few key areas has been traded to increase data throughput rates. - -Large Data Sets -Applications that run on HDFS have large data sets. A typical file in HDFS is gigabytes to terabytes in size. Thus, HDFS is tuned to support large files. It should provide high aggregate data bandwidth and scale to hundreds of nodes in a single cluster. It should support tens of millions of files in a single instance. - -Simple Coherency Model -HDFS applications need a write-once-read-many access model for files. A file once created, written, and closed need not be changed. This assumption simplifies data coherency issues and enables high throughput data access. 
A MapReduce application or a web crawler application fits perfectly with this model. There is a plan to support appending-writes to files in the future. - -“Moving Computation is Cheaper than Moving Data” -A computation requested by an application is much more efficient if it is executed near the data it operates on. This is especially true when the size of the data set is huge. This minimizes network congestion and increases the overall throughput of the system. The assumption is that it is often better to migrate the computation closer to where the data is located rather than moving the data to where the application is running. HDFS provides interfaces for applications to move themselves closer to where the data is located. - -Portability Across Heterogeneous Hardware and Software Platforms -HDFS has been designed to be easily portable from one platform to another. This facilitates widespread adoption of HDFS as a platform of choice for a large set of applications. - -NameNode and DataNodes -HDFS has a master/slave architecture. An HDFS cluster consists of a single NameNode, a master server that manages the file system namespace and regulates access to files by clients. In addition, there are a number of DataNodes, usually one per node in the cluster, which manage storage attached to the nodes that they run on. HDFS exposes a file system namespace and allows user data to be stored in files. Internally, a file is split into one or more blocks and these blocks are stored in a set of DataNodes. The NameNode executes file system namespace operations like opening, closing, and renaming files and directories. It also determines the mapping of blocks to DataNodes. The DataNodes are responsible for serving read and write requests from the file system’s clients. The DataNodes also perform block creation, deletion, and replication upon instruction from the NameNode. - -HDFS Architecture -The NameNode and DataNode are pieces of software designed to run on commodity machines. These machines typically run a GNU/Linux operating system (OS). HDFS is built using the Java language; any machine that supports Java can run the NameNode or the DataNode software. Usage of the highly portable Java language means that HDFS can be deployed on a wide range of machines. A typical deployment has a dedicated machine that runs only the NameNode software. Each of the other machines in the cluster runs one instance of the DataNode software. The architecture does not preclude running multiple DataNodes on the same machine but in a real deployment that is rarely the case. - -The existence of a single NameNode in a cluster greatly simplifies the architecture of the system. The NameNode is the arbitrator and repository for all HDFS metadata. The system is designed in such a way that user data never flows through the NameNode. - -The File System Namespace -HDFS supports a traditional hierarchical file organization. A user or an application can create directories and store files inside these directories. The file system namespace hierarchy is similar to most other existing file systems; one can create and remove files, move a file from one directory to another, or rename a file. HDFS does not yet implement user quotas. HDFS does not support hard links or soft links. However, the HDFS architecture does not preclude implementing these features. - -The NameNode maintains the file system namespace. Any change to the file system namespace or its properties is recorded by the NameNode. 
An application can specify the number of replicas of a file that should be maintained by HDFS. The number of copies of a file is called the replication factor of that file. This information is stored by the NameNode. - -Data Replication -HDFS is designed to reliably store very large files across machines in a large cluster. It stores each file as a sequence of blocks; all blocks in a file except the last block are the same size. The blocks of a file are replicated for fault tolerance. The block size and replication factor are configurable per file. An application can specify the number of replicas of a file. The replication factor can be specified at file creation time and can be changed later. Files in HDFS are write-once and have strictly one writer at any time. - -The NameNode makes all decisions regarding replication of blocks. It periodically receives a Heartbeat and a Blockreport from each of the DataNodes in the cluster. Receipt of a Heartbeat implies that the DataNode is functioning properly. A Blockreport contains a list of all blocks on a DataNode. - -HDFS DataNodes -Replica Placement: The First Baby Steps -The placement of replicas is critical to HDFS reliability and performance. Optimizing replica placement distinguishes HDFS from most other distributed file systems. This is a feature that needs lots of tuning and experience. The purpose of a rack-aware replica placement policy is to improve data reliability, availability, and network bandwidth utilization. The current implementation for the replica placement policy is a first effort in this direction. The short-term goals of implementing this policy are to validate it on production systems, learn more about its behavior, and build a foundation to test and research more sophisticated policies. - -Large HDFS instances run on a cluster of computers that commonly spread across many racks. Communication between two nodes in different racks has to go through switches. In most cases, network bandwidth between machines in the same rack is greater than network bandwidth between machines in different racks. - -The NameNode determines the rack id each DataNode belongs to via the process outlined in Hadoop Rack Awareness. A simple but non-optimal policy is to place replicas on unique racks. This prevents losing data when an entire rack fails and allows use of bandwidth from multiple racks when reading data. This policy evenly distributes replicas in the cluster which makes it easy to balance load on component failure. However, this policy increases the cost of writes because a write needs to transfer blocks to multiple racks. - -For the common case, when the replication factor is three, HDFS’s placement policy is to put one replica on one node in the local rack, another on a node in a different (remote) rack, and the last on a different node in the same remote rack. This policy cuts the inter-rack write traffic which generally improves write performance. The chance of rack failure is far less than that of node failure; this policy does not impact data reliability and availability guarantees. However, it does reduce the aggregate network bandwidth used when reading data since a block is placed in only two unique racks rather than three. With this policy, the replicas of a file do not evenly distribute across the racks. One third of replicas are on one node, two thirds of replicas are on one rack, and the other third are evenly distributed across the remaining racks. 
This policy improves write performance without compromising data reliability or read performance. - -The current, default replica placement policy described here is a work in progress. - -Replica Selection -To minimize global bandwidth consumption and read latency, HDFS tries to satisfy a read request from a replica that is closest to the reader. If there exists a replica on the same rack as the reader node, then that replica is preferred to satisfy the read request. If an HDFS cluster spans multiple data centers, then a replica that is resident in the local data center is preferred over any remote replica. - -Safemode -On startup, the NameNode enters a special state called Safemode. Replication of data blocks does not occur when the NameNode is in the Safemode state. The NameNode receives Heartbeat and Blockreport messages from the DataNodes. A Blockreport contains the list of data blocks that a DataNode is hosting. Each block has a specified minimum number of replicas. A block is considered safely replicated when the minimum number of replicas of that data block has checked in with the NameNode. After a configurable percentage of safely replicated data blocks checks in with the NameNode (plus an additional 30 seconds), the NameNode exits the Safemode state. It then determines the list of data blocks (if any) that still have fewer than the specified number of replicas. The NameNode then replicates these blocks to other DataNodes. - -The Persistence of File System Metadata -The HDFS namespace is stored by the NameNode. The NameNode uses a transaction log called the EditLog to persistently record every change that occurs to file system metadata. For example, creating a new file in HDFS causes the NameNode to insert a record into the EditLog indicating this. Similarly, changing the replication factor of a file causes a new record to be inserted into the EditLog. The NameNode uses a file in its local host OS file system to store the EditLog. The entire file system namespace, including the mapping of blocks to files and file system properties, is stored in a file called the FsImage. The FsImage is stored as a file in the NameNode’s local file system too. - -The NameNode keeps an image of the entire file system namespace and file Blockmap in memory. This key metadata item is designed to be compact, such that a NameNode with 4 GB of RAM is plenty to support a huge number of files and directories. When the NameNode starts up, it reads the FsImage and EditLog from disk, applies all the transactions from the EditLog to the in-memory representation of the FsImage, and flushes out this new version into a new FsImage on disk. It can then truncate the old EditLog because its transactions have been applied to the persistent FsImage. This process is called a checkpoint. In the current implementation, a checkpoint only occurs when the NameNode starts up. Work is in progress to support periodic checkpointing in the near future. - -The DataNode stores HDFS data in files in its local file system. The DataNode has no knowledge about HDFS files. It stores each block of HDFS data in a separate file in its local file system. The DataNode does not create all files in the same directory. Instead, it uses a heuristic to determine the optimal number of files per directory and creates subdirectories appropriately. It is not optimal to create all local files in the same directory because the local file system might not be able to efficiently support a huge number of files in a single directory. 
When a DataNode starts up, it scans through its local file system, generates a list of all HDFS data blocks that correspond to each of these local files and sends this report to the NameNode: this is the Blockreport. - -The Communication Protocols -All HDFS communication protocols are layered on top of the TCP/IP protocol. A client establishes a connection to a configurable TCP port on the NameNode machine. It talks the ClientProtocol with the NameNode. The DataNodes talk to the NameNode using the DataNode Protocol. A Remote Procedure Call (RPC) abstraction wraps both the Client Protocol and the DataNode Protocol. By design, the NameNode never initiates any RPCs. Instead, it only responds to RPC requests issued by DataNodes or clients. - -Robustness -The primary objective of HDFS is to store data reliably even in the presence of failures. The three common types of failures are NameNode failures, DataNode failures and network partitions. - -Data Disk Failure, Heartbeats and Re-Replication -Each DataNode sends a Heartbeat message to the NameNode periodically. A network partition can cause a subset of DataNodes to lose connectivity with the NameNode. The NameNode detects this condition by the absence of a Heartbeat message. The NameNode marks DataNodes without recent Heartbeats as dead and does not forward any new IO requests to them. Any data that was registered to a dead DataNode is not available to HDFS any more. DataNode death may cause the replication factor of some blocks to fall below their specified value. The NameNode constantly tracks which blocks need to be replicated and initiates replication whenever necessary. The necessity for re-replication may arise due to many reasons: a DataNode may become unavailable, a replica may become corrupted, a hard disk on a DataNode may fail, or the replication factor of a file may be increased. - -Cluster Rebalancing -The HDFS architecture is compatible with data rebalancing schemes. A scheme might automatically move data from one DataNode to another if the free space on a DataNode falls below a certain threshold. In the event of a sudden high demand for a particular file, a scheme might dynamically create additional replicas and rebalance other data in the cluster. These types of data rebalancing schemes are not yet implemented. - -Data Integrity -It is possible that a block of data fetched from a DataNode arrives corrupted. This corruption can occur because of faults in a storage device, network faults, or buggy software. The HDFS client software implements checksum checking on the contents of HDFS files. When a client creates an HDFS file, it computes a checksum of each block of the file and stores these checksums in a separate hidden file in the same HDFS namespace. When a client retrieves file contents it verifies that the data it received from each DataNode matches the checksum stored in the associated checksum file. If not, then the client can opt to retrieve that block from another DataNode that has a replica of that block. - -Metadata Disk Failure -The FsImage and the EditLog are central data structures of HDFS. A corruption of these files can cause the HDFS instance to be non-functional. For this reason, the NameNode can be configured to support maintaining multiple copies of the FsImage and EditLog. Any update to either the FsImage or EditLog causes each of the FsImages and EditLogs to get updated synchronously. 
This synchronous updating of multiple copies of the FsImage and EditLog may degrade the rate of namespace transactions per second that a NameNode can support. However, this degradation is acceptable because even though HDFS applications are very data intensive in nature, they are not metadata intensive. When a NameNode restarts, it selects the latest consistent FsImage and EditLog to use. - -The NameNode machine is a single point of failure for an HDFS cluster. If the NameNode machine fails, manual intervention is necessary. Currently, automatic restart and failover of the NameNode software to another machine is not supported. - -Snapshots -Snapshots support storing a copy of data at a particular instant of time. One usage of the snapshot feature may be to roll back a corrupted HDFS instance to a previously known good point in time. HDFS does not currently support snapshots but will in a future release. - -Data Organization -Data Blocks -HDFS is designed to support very large files. Applications that are compatible with HDFS are those that deal with large data sets. These applications write their data only once but they read it one or more times and require these reads to be satisfied at streaming speeds. HDFS supports write-once-read-many semantics on files. A typical block size used by HDFS is 64 MB. Thus, an HDFS file is chopped up into 64 MB chunks, and if possible, each chunk will reside on a different DataNode. - -Staging -A client request to create a file does not reach the NameNode immediately. In fact, initially the HDFS client caches the file data into a temporary local file. Application writes are transparently redirected to this temporary local file. When the local file accumulates data worth over one HDFS block size, the client contacts the NameNode. The NameNode inserts the file name into the file system hierarchy and allocates a data block for it. The NameNode responds to the client request with the identity of the DataNode and the destination data block. Then the client flushes the block of data from the local temporary file to the specified DataNode. When a file is closed, the remaining un-flushed data in the temporary local file is transferred to the DataNode. The client then tells the NameNode that the file is closed. At this point, the NameNode commits the file creation operation into a persistent store. If the NameNode dies before the file is closed, the file is lost. - -The above approach has been adopted after careful consideration of target applications that run on HDFS. These applications need streaming writes to files. If a client writes to a remote file directly without any client side buffering, the network speed and the congestion in the network impacts throughput considerably. This approach is not without precedent. Earlier distributed file systems, e.g. AFS, have used client side caching to improve performance. A POSIX requirement has been relaxed to achieve higher performance of data uploads. - -Replication Pipelining -When a client is writing data to an HDFS file, its data is first written to a local file as explained in the previous section. Suppose the HDFS file has a replication factor of three. When the local file accumulates a full block of user data, the client retrieves a list of DataNodes from the NameNode. This list contains the DataNodes that will host a replica of that block. The client then flushes the data block to the first DataNode. 
The first DataNode starts receiving the data in small portions (4 KB), writes each portion to its local repository and transfers that portion to the second DataNode in the list. The second DataNode, in turn starts receiving each portion of the data block, writes that portion to its repository and then flushes that portion to the third DataNode. Finally, the third DataNode writes the data to its local repository. Thus, a DataNode can be receiving data from the previous one in the pipeline and at the same time forwarding data to the next one in the pipeline. Thus, the data is pipelined from one DataNode to the next. - -Accessibility -HDFS can be accessed from applications in many different ways. Natively, HDFS provides a Java API for applications to use. A C language wrapper for this Java API is also available. In addition, an HTTP browser can also be used to browse the files of an HDFS instance. Work is in progress to expose HDFS through the WebDAV protocol. - -FS Shell -HDFS allows user data to be organized in the form of files and directories. It provides a commandline interface called FS shell that lets a user interact with the data in HDFS. The syntax of this command set is similar to other shells (e.g. bash, csh) that users are already familiar with. Here are some sample action/command pairs: - -Action Command -Create a directory named /foodir bin/hadoop dfs -mkdir /foodir -Remove a directory named /foodir bin/hadoop dfs -rmr /foodir -View the contents of a file named /foodir/myfile.txt bin/hadoop dfs -cat /foodir/myfile.txt -FS shell is targeted for applications that need a scripting language to interact with the stored data. - -DFSAdmin -The DFSAdmin command set is used for administering an HDFS cluster. These are commands that are used only by an HDFS administrator. Here are some sample action/command pairs: - -Action Command -Put the cluster in Safemode bin/hadoop dfsadmin -safemode enter -Generate a list of DataNodes bin/hadoop dfsadmin -report -Recommission or decommission DataNode(s) bin/hadoop dfsadmin -refreshNodes -Browser Interface -A typical HDFS install configures a web server to expose the HDFS namespace through a configurable TCP port. This allows a user to navigate the HDFS namespace and view the contents of its files using a web browser. - -Space Reclamation -File Deletes and Undeletes -When a file is deleted by a user or an application, it is not immediately removed from HDFS. Instead, HDFS first renames it to a file in the /trash directory. The file can be restored quickly as long as it remains in /trash. A file remains in /trash for a configurable amount of time. After the expiry of its life in /trash, the NameNode deletes the file from the HDFS namespace. The deletion of a file causes the blocks associated with the file to be freed. Note that there could be an appreciable time delay between the time a file is deleted by a user and the time of the corresponding increase in free space in HDFS. - -A user can Undelete a file after deleting it as long as it remains in the /trash directory. If a user wants to undelete a file that he/she has deleted, he/she can navigate the /trash directory and retrieve the file. The /trash directory contains only the latest copy of the file that was deleted. The /trash directory is just like any other directory with one special feature: HDFS applies specified policies to automatically delete files from this directory. The current default policy is to delete files from /trash that are more than 6 hours old. 
In the future, this policy will be configurable through a well defined interface. - -Decrease Replication Factor -When the replication factor of a file is reduced, the NameNode selects excess replicas that can be deleted. The next Heartbeat transfers this information to the DataNode. The DataNode then removes the corresponding blocks and the corresponding free space appears in the cluster. Once again, there might be a time delay between the completion of the setReplication API call and the appearance of free space in the cluster. - -References -HDFS Java API: https://hadoop.apache.org/core/docs/current/api/ - -HDFS source code: https://hadoop.apache.org/hdfs/version_control.html - -by Dhruba Borthakur \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt.xml.xls deleted file mode 100644 index 5d0278ae9431acf4a92aca1f2c1e0801518a91ea..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture-relation.txt deleted file mode 100644 index ab816b834fb8b000df29d5bc2a36a1c8de6a6e8d..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture-relation.txt +++ /dev/null @@ -1,561 +0,0 @@ -It&many similarity&依赖 -It&distributed file system&依赖 -application&large data set&依赖 -hdf&few POSIX requirement&依赖 -hdf&infrastructure&依赖 -hdf&Apache Nutch web search engine project&依赖 -part&Apache Hadoop Core project&AGGREGATION -hdf&Apache Hadoop Core project&依赖 -assumption and goal hardware failure hardware failure&exception&依赖 -HDFS instance&server machine&依赖 -hundreds or thousand&server machine&AGGREGATION -each store part&each store part&依赖 -system&data& -each store part&’s datum&依赖 -each store part&’s datum&AGGREGATION -HDFS instance&hundreds or thousand&依赖 -huge number&component&AGGREGATION -non-trivial probability&failure&AGGREGATION -component&failure&依赖 -component&non-trivial probability&依赖 -component&hdf&AGGREGATION -detection&hdf&依赖 -core architectural goal&hdf&AGGREGATION -detection&hdf&依赖 -detection&fault&AGGREGATION -their&data& -hdf&batch processing&依赖 -emphasis&low latency&依赖 -high throughput&data access&AGGREGATION -emphasis&data access&依赖 -low latency&data access&AGGREGATION -emphasis&data access&依赖 -POSIX&many hard requirement&依赖 -Large Data Sets application&large data set&依赖 -typical file&size&依赖 -typical file&size&依赖 -typical file&size&依赖 -typical file&terabyte&依赖 -typical file&terabyte&依赖 -typical file&size&依赖 -hundred&node&AGGREGATION -It&million&依赖 -It&ten&依赖 -It&file&依赖 -It&million&依赖 -It&file&依赖 -million&file&AGGREGATION -It&ten&依赖 -ten&million&AGGREGATION -Simple Coherency Model HDFS application&write-once-read-many access model&依赖 -Simple Coherency Model HDFS application&file&依赖 -end&file&AGGREGATION -assumption&data coherency issue&实现 -MapReduce application&application&GENERALIZATION -MapReduce application&model&依赖 -it&datum&依赖 -size&data set&AGGREGATION -network congestion&overall throughput&依赖 -network congestion&system&依赖 -overall throughput&system&AGGREGATION -platform&choice&AGGREGATION -widespread adoption&hdf&AGGREGATION -large set&application&AGGREGATION -master/slave architecture&architecture&GENERALIZATION -namenode and datanodes 
hdfs&master/slave architecture&依赖 -HDFS cluster&master server&依赖 -HDFS cluster&cluster&GENERALIZATION -HDFS cluster&single NameNode&依赖 -master server&file system namespace&依赖 -number&addition&依赖 -number&addition&依赖 -number&addition&依赖 -number&addition&依赖 -number&DataNodes&AGGREGATION -number&addition&依赖 -cluster&storage&依赖 -hdf&a file system namespace&依赖 -set&DataNodes&AGGREGATION -file&one or more block&依赖 -block&set&依赖 -block&DataNodes&依赖 -NameNode&file system namespace operation&依赖 -It&DataNodes&依赖 -mapping&block&AGGREGATION -It&mapping&依赖 -It&block&依赖 -system&clients& -DataNodes&block creation&依赖 -DataNodes&instruction&依赖 -DataNodes&NameNode&依赖 -NameNode and DataNode&software&依赖 -piece&software&AGGREGATION -machine&GNU/Linux operating system&依赖 -machine&Java&依赖 -DataNode software&software&GENERALIZATION -machine&NameNode&依赖 -Usage&portable Java language&AGGREGATION -wide range&machine&AGGREGATION -dedicated machine&machine&GENERALIZATION -NameNode software&software&GENERALIZATION -dedicated machine&NameNode software&依赖 -typical deployment&dedicated machine&依赖 -one instance&DataNode software&AGGREGATION -existence&single NameNode&AGGREGATION -existence&architecture&实现 -existence&system&实现 -architecture&system&AGGREGATION -NameNode&HDFS metada&依赖 -system&flows&依赖 -system&such a way&依赖 -user datum&NameNode&依赖 -File System Namespace hdf&traditional hierarchical file organization&依赖 -user&directory&依赖 -user&directory and store file&依赖 -one&file&依赖 -file system namespace hierarchy&most other existing file system&依赖 -hdf&user quota&依赖 -hdf&hard link&依赖 -HDFS architecture&feature&实现 -HDFS architecture&architecture&GENERALIZATION -convention&FileSystem&AGGREGATION -feature&reserved path&依赖 -feature&reserved path&依赖 -NameNode&file system namespace&依赖 -change&NameNode&依赖 -its&properties& -number&replica&AGGREGATION -application&number&依赖 -application&file&依赖 -replica&file&AGGREGATION -application&replica&依赖 -number&file&AGGREGATION -copy&file&AGGREGATION -number©&AGGREGATION -replication factor&file&AGGREGATION -information&NameNode&依赖 -It&file&依赖 -It&sequence&依赖 -sequence&block&AGGREGATION -It&block&依赖 -block&fault tolerance&依赖 -block&file&AGGREGATION -block size and replication factor&file&依赖 -user&new block&依赖 -support&append and hsync&依赖 -replication&block&AGGREGATION -NameNode&replication&依赖 -NameNode&block&依赖 -NameNode&decision&依赖 -Receipt&Heartbeat&AGGREGATION -list&block&AGGREGATION -Blockreport&list&依赖 -Blockreport&DataNode&依赖 -Blockreport&block&依赖 -placement&replica&AGGREGATION -replica placement&hdf&依赖 -replica placement&most other distributed file system&依赖 -lot&tuning and experience&AGGREGATION -feature&lot&依赖 -feature&tuning and experience&依赖 -purpose&rack-aware replica placement policy&AGGREGATION -purpose&data reliability&依赖 -current implementation&direction&依赖 -current implementation&direction&依赖 -short-term goal&it&依赖 -its&behavior& -Large HDFS instance&cluster&依赖 -Large HDFS instance&computer&依赖 -cluster&computer&AGGREGATION -NameNode&rack id&依赖 -simple but non-optimal policy&replica&依赖 -entire rack&bandwidth&依赖 -entire rack&use&依赖 -entire rack&multiple rack&依赖 -use&bandwidth&AGGREGATION -policy&replica&依赖 -policy&cluster&依赖 -write&block&依赖 -policy&cost&依赖 -HDFS&policy& -writer&same rack as ###&依赖 -’s placement policy&one replica&依赖 -writer&random datanode&依赖 -writer&same rack&依赖 -inter-rack write traffic&write performance&依赖 -policy&inter-rack write traffic&依赖 -chance&rack failure&AGGREGATION -policy&impact datum reliability and availability guarantee&依赖 -it&aggregate network 
bandwidth&依赖 -datum&three&依赖 -datum&two unique rack&依赖 -replica&block&AGGREGATION -replica&rack&依赖 -Two replica&one rack&依赖 -one&other rack&AGGREGATION -replica&one&依赖 -different node&one rack&AGGREGATION -replica&other rack&依赖 -node&one&AGGREGATION -policy&performance&依赖 -placement&4th and following replica&AGGREGATION -maximum number&replica&AGGREGATION -maximum number&datanode&依赖 -maximum number&time&依赖 -multiple replica&same block&AGGREGATION -NameNode&same block&依赖 -maximum number&time&依赖 -maximum number&time&依赖 -total number&datanode&AGGREGATION -maximum number&time&依赖 -maximum number&datanode&依赖 -NameNode&DataNodes&依赖 -maximum number&time&依赖 -maximum number&datanode&依赖 -NameNode&multiple replica&依赖 -maximum number&time&依赖 -NameNode&addition&依赖 -NameNode&account&依赖 -support&hdf&依赖 -NameNode&rack awareness&依赖 -NameNode&policy&依赖 -NameNode&policy&依赖 -NameNode&node&依赖 -NameNode&node&依赖 -NameNode&node&依赖 -candidate node&storage&依赖 -candidate node&node&GENERALIZATION -NameNode&node&依赖 -candidate node&storage type&依赖 -NameNode&second path&依赖 -enough node&first path&依赖 -NameNode&fallback storage type&依赖 -current , default replica placement policy&progress&依赖 -current , default replica placement policy&progress&依赖 -hdf&replica&依赖 -Replica Selection&global bandwidth consumption&依赖 -hdf&read request&依赖 -HDFS cluster&multiple data center&依赖 -replica&remote replica&依赖 -Additional&4 different pluggable Block Placement policy&依赖 -their&infrastructre& -user&policy&依赖 -default hdf&BlockPlacementPolicyDefault&依赖 -default hdf&default hdf&依赖 -NameNode&special state&依赖 -NameNode&special state&依赖 -Replication&data block&AGGREGATION -NameNode&Heartbeat and Blockreport message&依赖 -NameNode&DataNodes&依赖 -Blockreport&data block&依赖 -list&data block&AGGREGATION -Blockreport&hosting&依赖 -block&replica&依赖 -block&specified minimum number&依赖 -specified minimum number&replica&AGGREGATION -data block&block&GENERALIZATION -replica&data block&AGGREGATION -minimum number&replica&AGGREGATION -namenode exit&namenode (&依赖 -namenode exit&Safemode state&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&Safemode state&依赖 -namenode exit&namenode (&依赖 -namenode exit&Safemode state&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&namenode (&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&namenode (&依赖 -namenode exit&Safemode state&依赖 -namenode exit&Safemode state&依赖 -namenode exit&Safemode state&依赖 -namenode exit&Safemode state&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&Safemode state&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&namenode (&依赖 -namenode exit&Safemode state&依赖 -namenode exit&safely replicate datum block check&依赖 -namenode exit&additional 30 second&依赖 -namenode exit&Safemode state&依赖 -It&list&依赖 -It&data block&依赖 -It&)&依赖 -specified number&replica&AGGREGATION -NameNode&block&依赖 -NameNode&other datanode&依赖 -HDFS namespace&NameNode&依赖 -Persistence&File System Metadata&AGGREGATION -NameNode&transaction log&依赖 -NameNode&EditLog&依赖 -NameNode&system metada&依赖 -NameNode&file&依赖 -NameNode&local host OS file system&依赖 -its&system& -entire file system namespace&file&依赖 -NameNode&system& -FsImage&’s local file system&依赖 -FsImage&file&依赖 -NameNode&memory&依赖 -NameNode&entire file system namespace and file blockmap&依赖 -image&entire file system namespace and file blockmap&AGGREGATION -checkpoint&configurable 
threshold&依赖 -in-memory representation&FsImage&AGGREGATION -it&FsImage and EditLog&依赖 -it&disk&依赖 -It&old EditLog&依赖 -transaction&persistent FsImage&依赖 -its&transactions& -purpose&checkpoint&AGGREGATION -hdf&file system metada&依赖 -consistent view&file system metada&AGGREGATION -hdf&consistent view&依赖 -snapshot&file system metada&AGGREGATION -it&incremental edit&依赖 -it&FsImage&依赖 -we&edit&依赖 -we&Editlog&依赖 -change&checkpoint&依赖 -change&FsImage&依赖 -given number&filesystem transaction&AGGREGATION -given number&( dfs.namenode.checkpoint.txns )&依赖 -first threshold&checkpoint&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -DataNode&knowledge&依赖 -DataNode&HDFS file&依赖 -It&HDFS datum&依赖 -It&block&依赖 -It&HDFS datum&依赖 -block&HDFS datum&AGGREGATION -It&block&依赖 -It&block&依赖 -It&HDFS datum&依赖 -DataNode&file&依赖 -DataNode&same directory&依赖 -optimal number&file&AGGREGATION -it&heuristic&依赖 -It&local file&依赖 -local file system&file&依赖 -local file system&huge number&依赖 -huge number&file&AGGREGATION -It&same directory&依赖 -local file system&single directory&依赖 -list&HDFS data block&AGGREGATION -it&local file system&依赖 -Communication Protocols All HDFS communication protocol&TCP/IP protocol&依赖 -Communication Protocols All HDFS communication protocol&top&依赖 -top&TCP/IP protocol&AGGREGATION -client&configurable TCP port&依赖 -client&NameNode machine&依赖 -NameNode machine&machine&GENERALIZATION -client&connection&依赖 -It&NameNode&依赖 -It&ClientProtocol&依赖 -DataNodes&DataNode Protocol&依赖 -DataNodes&NameNode&依赖 -( rpc ) abstraction&Client Protocol&依赖 -NameNode&rpc&依赖 -NameNode&design&依赖 -it&RPC request&依赖 -robustness primary objective&hdf&AGGREGATION -robustness primary objective&datum&依赖 -presence&failure&AGGREGATION -three common type&failure&AGGREGATION -Data Disk Failure&NameNode&依赖 -Data Disk Failure&Heartbeat message&依赖 -network partition&subset&依赖 -network partition&DataNodes&依赖 -subset&DataNodes&AGGREGATION -NameNode&condition&依赖 -NameNode&absence&依赖 -NameNode&Heartbeat message&依赖 -absence&Heartbeat message&AGGREGATION -NameNode mark&recent heartbeat&依赖 -datum&hdf&依赖 -DataNode death&block&依赖 -DataNode death&replication factor&依赖 -replication factor&block&AGGREGATION -their&value& -state flap&DataNodes&AGGREGATION -user&shorter interval&依赖 -HDFS architecture&data rebalancing scheme&依赖 -scheme&one DataNode&依赖 -scheme&datum&依赖 
-free space&certain threshold&依赖 -scheme&one DataNode to ###&依赖 -free space&certain threshold&依赖 -scheme&additional replica&依赖 -scheme&additional replica&依赖 -scheme&particular file&依赖 -event&sudden high demand&AGGREGATION -scheme&sudden high demand&依赖 -type&data rebalancing scheme&AGGREGATION -block&datum&AGGREGATION -corruption&fault&依赖 -corruption&storage device&依赖 -checksum checking&HDFS file&依赖 -contents&HDFS file&AGGREGATION -checksum checking&contents&依赖 -checksum checking&HDFS file&依赖 -checksum checking&contents&依赖 -checksum checking&HDFS file&依赖 -checksum checking&contents&依赖 -HDFS file&file&GENERALIZATION -it&separate hidden file&依赖 -it&checksum&依赖 -it&block&依赖 -block&file and store&AGGREGATION -checksum&block&AGGREGATION -client&HDFS file&依赖 -it&file and store&依赖 -file contents&contents&GENERALIZATION -it&checksum&依赖 -client&file contents&依赖 -DataNode&replica&依赖 -DataNode&block&依赖 -central data structure&hdf&AGGREGATION -Metadata Disk Failure The FsImage&hdf&依赖 -corruption&file&AGGREGATION -corruption&HDFS instance&依赖 -multiple copy&FsImage and EditLog&AGGREGATION -update&updated synchronously&依赖 -synchronous update&rate&依赖 -rate&namespace transaction&AGGREGATION -synchronous update&support&依赖 -synchronous update&multiple copy&AGGREGATION -it&latest consistent fsimage&依赖 -it&use&依赖 -High Availability&shared storage&依赖 -High Availability&nf&依赖 -High Availability&multiple namenode&依赖 -particular instant&time&AGGREGATION -copy&datum&AGGREGATION -snapshot snapshot©&依赖 -snapshot snapshot&support&依赖 -snapshot snapshot&datum&依赖 -One usage&corrupted HDFS instance&依赖 -One usage&snapshot feature&AGGREGATION -application&datum&依赖 -they&one or more time&依赖 -hdf&write-once-read-many semantics&依赖 -hdf&file&依赖 -chunk&different DataNode&依赖 -NameNode&list&依赖 -NameNode&datanode&依赖 -Replication Pipelining&replication factor&依赖 -client&datum&依赖 -client&three&依赖 -NameNode&a replication target choose algorithm&依赖 -client&datum&依赖 -client&replication factor&依赖 -Replication Pipelining&three&依赖 -replication factor&three&AGGREGATION -list&datanode&AGGREGATION -DataNodes&replica&依赖 -DataNodes&block&依赖 -list&DataNodes&依赖 -client&first DataNode&依赖 -its&repository& -first DataNode&datum&依赖 -turn start&portion&依赖 -portion&data block&AGGREGATION -second DataNode&portion&依赖 -turn start&data block&依赖 -second DataNode&portion&依赖 -third DataNode&datum&依赖 -third DataNode&local repository&依赖 -DataNode&previous one&依赖 -DataNode&datum&依赖 -DataNode&pipeline&依赖 -datum&one DataNode&依赖 -datum&next&依赖 -Accessibility hdf&many different way&依赖 -Accessibility hdf&application&依赖 -file&HDFS instance&AGGREGATION -hdf&part&依赖 -client&system& -hdf&’s local file system&依赖 -part&’s local file system&AGGREGATION -FS Shell HDFS&user datum&依赖 -form&files and directory&AGGREGATION -FS shell&user interact&依赖 -FS shell&datum&依赖 -syntax&command set&AGGREGATION -Action Command&directory&依赖 -Action Command&/ foodir bin/hadoop fs&依赖 -txt FS shell&application&依赖 -cat / foodir/myfile&language&依赖 -contents&file&AGGREGATION -report Recommission or decommission DataNode&) bin/hdfs dfsadmin&依赖 -refreshnodes browser interface a typical hdf&HDFS namespace&依赖 -list&DataNodes bin/hdfs dfsadmin&AGGREGATION -refreshnodes browser interface a typical hdf&web server&依赖 -Action Command&cluster&依赖 -Action Command&Safemode bin/hdfs dfsadmin&依赖 -its&files& -file&hdf&依赖 -user&/ user / /&依赖 -its&directory& -hdf&a trash directory (&依赖 -user&own trash directory&依赖 -hdf&it&依赖 -it&trash&依赖 -Most recent deleted file¤t trash directory ( / user / /&依赖 -hdf&checkpoint&依赖 
-hdf&( under / user&依赖 -checkpointing&trash&AGGREGATION -expunge command&FS shell&AGGREGATION -expiry&life&AGGREGATION -NameNode&file&依赖 -NameNode&HDFS namespace&依赖 -NameNode&file&依赖 -its&life& -NameNode&trash&依赖 -deletion&block&依赖 -deletion&file&AGGREGATION -file&user&依赖 -time&corresponding increase&AGGREGATION -We&test1 & test2 )&依赖 -We&2 file test1 & test2 )&依赖 -We&file test1&依赖 -we&file&依赖 -Trash/Current&hdf&依赖 -skipTrash option&file&依赖 -we&skipTrash option&依赖 -skipTrash option&Trash.It&依赖 -We&file test1&依赖 -NameNode&excess replica&依赖 -next heartbeat transfer&information&依赖 -corresponding free space&cluster&依赖 -DataNode&corresponding block&依赖 -completion&setReplication API call&AGGREGATION -appearance&free space&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture.txt deleted file mode 100644 index c812cce02a7b16e36284136575d0f9b8172d71ea..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture.txt +++ /dev/null @@ -1,359 +0,0 @@ -General -Overview -Single Node Setup -Cluster Setup -Commands Reference -FileSystem Shell -Compatibility Specification -Downstream Developer's Guide -Admin Compatibility Guide -Interface Classification -FileSystem Specification -Common -CLI Mini Cluster -Fair Call Queue -Native Libraries -Proxy User -Rack Awareness -Secure Mode -Service Level Authorization -HTTP Authentication -Credential Provider API -Hadoop KMS -Tracing -Unix Shell Guide -Registry -HDFS -Architecture -User Guide -Commands Reference -NameNode HA With QJM -NameNode HA With NFS -Observer NameNode -Federation -ViewFs -ViewFsOverloadScheme -Snapshots -Edits Viewer -Image Viewer -Permissions and HDFS -Quotas and HDFS -libhdfs (C API) -WebHDFS (REST API) -HttpFS -Short Circuit Local Reads -Centralized Cache Management -NFS Gateway -Rolling Upgrade -Extended Attributes -Transparent Encryption -Multihoming -Storage Policies -Memory Storage Support -Synthetic Load Generator -Erasure Coding -Disk Balancer -Upgrade Domain -DataNode Admin -Router Federation -Provided Storage -MapReduce -Tutorial -Commands Reference -Compatibility with 1.x -Encrypted Shuffle -Pluggable Shuffle/Sort -Distributed Cache Deploy -Support for YARN Shared Cache -MapReduce REST APIs -MR Application Master -MR History Server -YARN -Architecture -Commands Reference -Capacity Scheduler -Fair Scheduler -ResourceManager Restart -ResourceManager HA -Resource Model -Node Labels -Node Attributes -Web Application Proxy -Timeline Server -Timeline Service V.2 -Writing YARN Applications -YARN Application Security -NodeManager -Running Applications in Docker Containers -Running Applications in runC Containers -Using CGroups -Secure Containers -Reservation System -Graceful Decommission -Opportunistic Containers -YARN Federation -Shared Cache -Using GPU -Using FPGA -Placement Constraints -YARN UI2 -YARN REST APIs -Introduction -Resource Manager -Node Manager -Timeline Server -Timeline Service V.2 -YARN Service -Overview -QuickStart -Concepts -Yarn Service API -Service Discovery -System Services -Hadoop Compatible File Systems -Aliyun OSS -Amazon S3 -Azure Blob Storage -Azure Data Lake Storage -OpenStack Swift -Tencent COS -Auth -Overview -Examples -Configuration -Building -Tools -Hadoop Streaming -Hadoop Archives -Hadoop Archive Logs -DistCp -GridMix -Rumen -Resource Estimator Service -Scheduler Load Simulator -Hadoop Benchmarking -Dynamometer -Reference -Changelog and Release Notes -Java 
API docs -Unix Shell API -Metrics -Configuration -core-default.xml -hdfs-default.xml -hdfs-rbf-default.xml -mapred-default.xml -yarn-default.xml -kms-default.xml -httpfs-default.xml -Deprecated Properties -Built by Maven -HDFS Architecture -Introduction -Assumptions and Goals -Hardware Failure -Streaming Data Access -Large Data Sets -Simple Coherency Model -“Moving Computation is Cheaper than Moving Data” -Portability Across Heterogeneous Hardware and Software Platforms -NameNode and DataNodes -The File System Namespace -Data Replication -Replica Placement: The First Baby Steps -Replica Selection -Block Placement Policies -Safemode -The Persistence of File System Metadata -The Communication Protocols -Robustness -Data Disk Failure, Heartbeats and Re-Replication -Cluster Rebalancing -Data Integrity -Metadata Disk Failure -Snapshots -Data Organization -Data Blocks -Replication Pipelining -Accessibility -FS Shell -DFSAdmin -Browser Interface -Space Reclamation -File Deletes and Undeletes -Decrease Replication Factor -References -Introduction -The Hadoop Distributed File System (HDFS) is a distributed file system designed to run on commodity hardware. It has many similarities with existing distributed file systems. However, the differences from other distributed file systems are significant. HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware. HDFS provides high throughput access to application data and is suitable for applications that have large data sets. HDFS relaxes a few POSIX requirements to enable streaming access to file system data. HDFS was originally built as infrastructure for the Apache Nutch web search engine project. HDFS is part of the Apache Hadoop Core project. The project URL is http://hadoop.apache.org/. - -Assumptions and Goals -Hardware Failure -Hardware failure is the norm rather than the exception. An HDFS instance may consist of hundreds or thousands of server machines, each storing part of the file system’s data. The fact that there are a huge number of components and that each component has a non-trivial probability of failure means that some component of HDFS is always non-functional. Therefore, detection of faults and quick, automatic recovery from them is a core architectural goal of HDFS. - -Streaming Data Access -Applications that run on HDFS need streaming access to their data sets. They are not general purpose applications that typically run on general purpose file systems. HDFS is designed more for batch processing rather than interactive use by users. The emphasis is on high throughput of data access rather than low latency of data access. POSIX imposes many hard requirements that are not needed for applications that are targeted for HDFS. POSIX semantics in a few key areas has been traded to increase data throughput rates. - -Large Data Sets -Applications that run on HDFS have large data sets. A typical file in HDFS is gigabytes to terabytes in size. Thus, HDFS is tuned to support large files. It should provide high aggregate data bandwidth and scale to hundreds of nodes in a single cluster. It should support tens of millions of files in a single instance. - -Simple Coherency Model -HDFS applications need a write-once-read-many access model for files. A file once created, written, and closed need not be changed except for appends and truncates. Appending the content to the end of the files is supported but cannot be updated at arbitrary point. 
This assumption simplifies data coherency issues and enables high throughput data access. A MapReduce application or a web crawler application fits perfectly with this model. - -“Moving Computation is Cheaper than Moving Data” -A computation requested by an application is much more efficient if it is executed near the data it operates on. This is especially true when the size of the data set is huge. This minimizes network congestion and increases the overall throughput of the system. The assumption is that it is often better to migrate the computation closer to where the data is located rather than moving the data to where the application is running. HDFS provides interfaces for applications to move themselves closer to where the data is located. - -Portability Across Heterogeneous Hardware and Software Platforms -HDFS has been designed to be easily portable from one platform to another. This facilitates widespread adoption of HDFS as a platform of choice for a large set of applications. - -NameNode and DataNodes -HDFS has a master/slave architecture. An HDFS cluster consists of a single NameNode, a master server that manages the file system namespace and regulates access to files by clients. In addition, there are a number of DataNodes, usually one per node in the cluster, which manage storage attached to the nodes that they run on. HDFS exposes a file system namespace and allows user data to be stored in files. Internally, a file is split into one or more blocks and these blocks are stored in a set of DataNodes. The NameNode executes file system namespace operations like opening, closing, and renaming files and directories. It also determines the mapping of blocks to DataNodes. The DataNodes are responsible for serving read and write requests from the file system’s clients. The DataNodes also perform block creation, deletion, and replication upon instruction from the NameNode. - -HDFS Architecture - -The NameNode and DataNode are pieces of software designed to run on commodity machines. These machines typically run a GNU/Linux operating system (OS). HDFS is built using the Java language; any machine that supports Java can run the NameNode or the DataNode software. Usage of the highly portable Java language means that HDFS can be deployed on a wide range of machines. A typical deployment has a dedicated machine that runs only the NameNode software. Each of the other machines in the cluster runs one instance of the DataNode software. The architecture does not preclude running multiple DataNodes on the same machine but in a real deployment that is rarely the case. - -The existence of a single NameNode in a cluster greatly simplifies the architecture of the system. The NameNode is the arbitrator and repository for all HDFS metadata. The system is designed in such a way that user data never flows through the NameNode. - -The File System Namespace -HDFS supports a traditional hierarchical file organization. A user or an application can create directories and store files inside these directories. The file system namespace hierarchy is similar to most other existing file systems; one can create and remove files, move a file from one directory to another, or rename a file. HDFS supports user quotas and access permissions. HDFS does not support hard links or soft links. However, the HDFS architecture does not preclude implementing these features. - -While HDFS follows naming convention of the FileSystem, some paths and names (e.g. /.reserved and .snapshot ) are reserved. 
Features such as transparent encryption and snapshot use reserved paths. - -The NameNode maintains the file system namespace. Any change to the file system namespace or its properties is recorded by the NameNode. An application can specify the number of replicas of a file that should be maintained by HDFS. The number of copies of a file is called the replication factor of that file. This information is stored by the NameNode. - -Data Replication -HDFS is designed to reliably store very large files across machines in a large cluster. It stores each file as a sequence of blocks. The blocks of a file are replicated for fault tolerance. The block size and replication factor are configurable per file. - -All blocks in a file except the last block are the same size, while users can start a new block without filling out the last block to the configured block size after the support for variable length block was added to append and hsync. - -An application can specify the number of replicas of a file. The replication factor can be specified at file creation time and can be changed later. Files in HDFS are write-once (except for appends and truncates) and have strictly one writer at any time. - -The NameNode makes all decisions regarding replication of blocks. It periodically receives a Heartbeat and a Blockreport from each of the DataNodes in the cluster. Receipt of a Heartbeat implies that the DataNode is functioning properly. A Blockreport contains a list of all blocks on a DataNode. - -HDFS DataNodes - -Replica Placement: The First Baby Steps -The placement of replicas is critical to HDFS reliability and performance. Optimizing replica placement distinguishes HDFS from most other distributed file systems. This is a feature that needs lots of tuning and experience. The purpose of a rack-aware replica placement policy is to improve data reliability, availability, and network bandwidth utilization. The current implementation for the replica placement policy is a first effort in this direction. The short-term goals of implementing this policy are to validate it on production systems, learn more about its behavior, and build a foundation to test and research more sophisticated policies. - -Large HDFS instances run on a cluster of computers that commonly spread across many racks. Communication between two nodes in different racks has to go through switches. In most cases, network bandwidth between machines in the same rack is greater than network bandwidth between machines in different racks. - -The NameNode determines the rack id each DataNode belongs to via the process outlined in Hadoop Rack Awareness. A simple but non-optimal policy is to place replicas on unique racks. This prevents losing data when an entire rack fails and allows use of bandwidth from multiple racks when reading data. This policy evenly distributes replicas in the cluster which makes it easy to balance load on component failure. However, this policy increases the cost of writes because a write needs to transfer blocks to multiple racks. - -For the common case, when the replication factor is three, HDFS’s placement policy is to put one replica on the local machine if the writer is on a datanode, otherwise on a random datanode in the same rack as that of the writer, another replica on a node in a different (remote) rack, and the last on a different node in the same remote rack. This policy cuts the inter-rack write traffic which generally improves write performance. 
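Which DataNodes actually ended up holding the replicas of a given file can be checked from any client through the public FileSystem API. The sketch below is an illustration only; the NameNode URI and the file path are placeholders rather than values taken from this document.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.BlockLocation;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ShowBlockLocations {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // Placeholder NameNode URI; normally this comes from core-site.xml.
            conf.set("fs.defaultFS", "hdfs://namenode.example.com:8020");
            try (FileSystem fs = FileSystem.get(conf)) {
                Path file = new Path("/user/hadoop/sample.txt");
                FileStatus status = fs.getFileStatus(file);
                // Ask the NameNode which DataNodes hold each block of the file.
                BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
                for (BlockLocation block : blocks) {
                    System.out.println("offset=" + block.getOffset()
                            + " hosts=" + String.join(",", block.getHosts()));
                }
            }
        }
    }

Each BlockLocation also exposes the rack path of its hosts via getTopologyPaths(), which makes the two-racks-for-three-replicas layout described here straightforward to verify on a rack-aware cluster.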
The chance of rack failure is far less than that of node failure; this policy does not impact data reliability and availability guarantees. However, it does not reduce the aggregate network bandwidth used when reading data since a block is placed in only two unique racks rather than three. With this policy, the replicas of a block do not evenly distribute across the racks. Two replicas are on different nodes of one rack and the remaining replica is on a node of one of the other racks. This policy improves write performance without compromising data reliability or read performance. - -If the replication factor is greater than 3, the placement of the 4th and following replicas is determined randomly while keeping the number of replicas per rack below the upper limit (which is basically (replicas - 1) / racks + 2). - -Because the NameNode does not allow DataNodes to have multiple replicas of the same block, the maximum number of replicas created is the total number of DataNodes at that time. - -After the support for Storage Types and Storage Policies was added to HDFS, the NameNode takes the policy into account for replica placement in addition to the rack awareness described above. The NameNode chooses nodes based on rack awareness at first, then checks that the candidate node has the storage required by the policy associated with the file. If the candidate node does not have the storage type, the NameNode looks for another node. If enough nodes to place replicas cannot be found in the first path, the NameNode looks for nodes having fallback storage types in the second path. - -The current, default replica placement policy described here is a work in progress. - -Replica Selection -To minimize global bandwidth consumption and read latency, HDFS tries to satisfy a read request from a replica that is closest to the reader. If there exists a replica on the same rack as the reader node, then that replica is preferred to satisfy the read request. If the HDFS cluster spans multiple data centers, then a replica that is resident in the local data center is preferred over any remote replica. - -Block Placement Policies -As mentioned above, when the replication factor is three, HDFS’s placement policy is to put one replica on the local machine if the writer is on a datanode, otherwise on a random datanode in the same rack as that of the writer, another replica on a node in a different (remote) rack, and the last on a different node in the same remote rack. If the replication factor is greater than 3, the placement of the 4th and following replicas is determined randomly while keeping the number of replicas per rack below the upper limit (which is basically (replicas - 1) / racks + 2). In addition to this, HDFS supports four different pluggable Block Placement Policies. Users can choose the policy based on their infrastructure and use case. By default, HDFS uses BlockPlacementPolicyDefault. - -Safemode -On startup, the NameNode enters a special state called Safemode. Replication of data blocks does not occur when the NameNode is in the Safemode state. The NameNode receives Heartbeat and Blockreport messages from the DataNodes. A Blockreport contains the list of data blocks that a DataNode is hosting. Each block has a specified minimum number of replicas. A block is considered safely replicated when the minimum number of replicas of that data block has checked in with the NameNode. 
After a configurable percentage of safely replicated data blocks checks in with the NameNode (plus an additional 30 seconds), the NameNode exits the Safemode state. It then determines the list of data blocks (if any) that still have fewer than the specified number of replicas. The NameNode then replicates these blocks to other DataNodes. - -The Persistence of File System Metadata -The HDFS namespace is stored by the NameNode. The NameNode uses a transaction log called the EditLog to persistently record every change that occurs to file system metadata. For example, creating a new file in HDFS causes the NameNode to insert a record into the EditLog indicating this. Similarly, changing the replication factor of a file causes a new record to be inserted into the EditLog. The NameNode uses a file in its local host OS file system to store the EditLog. The entire file system namespace, including the mapping of blocks to files and file system properties, is stored in a file called the FsImage. The FsImage is stored as a file in the NameNode’s local file system too. - -The NameNode keeps an image of the entire file system namespace and file Blockmap in memory. When the NameNode starts up, or a checkpoint is triggered by a configurable threshold, it reads the FsImage and EditLog from disk, applies all the transactions from the EditLog to the in-memory representation of the FsImage, and flushes out this new version into a new FsImage on disk. It can then truncate the old EditLog because its transactions have been applied to the persistent FsImage. This process is called a checkpoint. The purpose of a checkpoint is to make sure that HDFS has a consistent view of the file system metadata by taking a snapshot of the file system metadata and saving it to FsImage. Even though it is efficient to read a FsImage, it is not efficient to make incremental edits directly to a FsImage. Instead of modifying FsImage for each edit, we persist the edits in the Editlog. During the checkpoint the changes from Editlog are applied to the FsImage. A checkpoint can be triggered at a given time interval (dfs.namenode.checkpoint.period) expressed in seconds, or after a given number of filesystem transactions have accumulated (dfs.namenode.checkpoint.txns). If both of these properties are set, the first threshold to be reached triggers a checkpoint. - -The DataNode stores HDFS data in files in its local file system. The DataNode has no knowledge about HDFS files. It stores each block of HDFS data in a separate file in its local file system. The DataNode does not create all files in the same directory. Instead, it uses a heuristic to determine the optimal number of files per directory and creates subdirectories appropriately. It is not optimal to create all local files in the same directory because the local file system might not be able to efficiently support a huge number of files in a single directory. When a DataNode starts up, it scans through its local file system, generates a list of all HDFS data blocks that correspond to each of these local files, and sends this report to the NameNode. The report is called the Blockreport. - -The Communication Protocols -All HDFS communication protocols are layered on top of the TCP/IP protocol. A client establishes a connection to a configurable TCP port on the NameNode machine. It talks the ClientProtocol with the NameNode. The DataNodes talk to the NameNode using the DataNode Protocol. 
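From an application's point of view, "talking the ClientProtocol" usually happens indirectly through the FileSystem API. The following minimal sketch assumes a reachable cluster; the hostname, port, and path are placeholders, not values from this document.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;

    public class ReadFromHdfs {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // The client connects to the configurable NameNode RPC port (placeholder host and port).
            conf.set("fs.defaultFS", "hdfs://namenode.example.com:8020");
            try (FileSystem fs = FileSystem.get(conf)) {
                Path file = new Path("/user/hadoop/sample.txt");
                // Metadata calls such as exists() and open() go to the NameNode over the ClientProtocol;
                // the block contents are then streamed directly from the DataNodes that hold them.
                if (fs.exists(file)) {
                    try (FSDataInputStream in = fs.open(file)) {
                        IOUtils.copyBytes(in, System.out, 4096, false);
                    }
                }
            }
        }
    }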
A Remote Procedure Call (RPC) abstraction wraps both the Client Protocol and the DataNode Protocol. By design, the NameNode never initiates any RPCs. Instead, it only responds to RPC requests issued by DataNodes or clients. - -Robustness -The primary objective of HDFS is to store data reliably even in the presence of failures. The three common types of failures are NameNode failures, DataNode failures and network partitions. - -Data Disk Failure, Heartbeats and Re-Replication -Each DataNode sends a Heartbeat message to the NameNode periodically. A network partition can cause a subset of DataNodes to lose connectivity with the NameNode. The NameNode detects this condition by the absence of a Heartbeat message. The NameNode marks DataNodes without recent Heartbeats as dead and does not forward any new IO requests to them. Any data that was registered to a dead DataNode is not available to HDFS any more. DataNode death may cause the replication factor of some blocks to fall below their specified value. The NameNode constantly tracks which blocks need to be replicated and initiates replication whenever necessary. The necessity for re-replication may arise due to many reasons: a DataNode may become unavailable, a replica may become corrupted, a hard disk on a DataNode may fail, or the replication factor of a file may be increased. - -The time-out to mark DataNodes dead is conservatively long (over 10 minutes by default) in order to avoid replication storm caused by state flapping of DataNodes. Users can set shorter interval to mark DataNodes as stale and avoid stale nodes on reading and/or writing by configuration for performance sensitive workloads. - -Cluster Rebalancing -The HDFS architecture is compatible with data rebalancing schemes. A scheme might automatically move data from one DataNode to another if the free space on a DataNode falls below a certain threshold. In the event of a sudden high demand for a particular file, a scheme might dynamically create additional replicas and rebalance other data in the cluster. These types of data rebalancing schemes are not yet implemented. - -Data Integrity -It is possible that a block of data fetched from a DataNode arrives corrupted. This corruption can occur because of faults in a storage device, network faults, or buggy software. The HDFS client software implements checksum checking on the contents of HDFS files. When a client creates an HDFS file, it computes a checksum of each block of the file and stores these checksums in a separate hidden file in the same HDFS namespace. When a client retrieves file contents it verifies that the data it received from each DataNode matches the checksum stored in the associated checksum file. If not, then the client can opt to retrieve that block from another DataNode that has a replica of that block. - -Metadata Disk Failure -The FsImage and the EditLog are central data structures of HDFS. A corruption of these files can cause the HDFS instance to be non-functional. For this reason, the NameNode can be configured to support maintaining multiple copies of the FsImage and EditLog. Any update to either the FsImage or EditLog causes each of the FsImages and EditLogs to get updated synchronously. This synchronous updating of multiple copies of the FsImage and EditLog may degrade the rate of namespace transactions per second that a NameNode can support. However, this degradation is acceptable because even though HDFS applications are very data intensive in nature, they are not metadata intensive. 
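The multiple copies of the FsImage and EditLog mentioned above are controlled by configuration rather than code. As a rough illustration only: the dfs.namenode.name.dir property accepts a comma-separated list of directories, and each listed directory receives a complete, synchronously updated copy of the metadata. It normally lives in hdfs-site.xml; the sketch below sets it programmatically just to keep the idea in a few lines, and the paths are placeholders.

    import org.apache.hadoop.conf.Configuration;

    public class NameNodeMetadataDirs {
        public static void main(String[] args) {
            Configuration conf = new Configuration();
            // Each directory in the list holds a full copy of the FsImage and EditLog.
            // A local disk plus a remote NFS mount is a common pairing; both paths here are placeholders.
            conf.set("dfs.namenode.name.dir",
                    "/data/1/dfs/name,/mnt/remote-nfs/dfs/name");
            System.out.println(conf.get("dfs.namenode.name.dir"));
        }
    }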
When a NameNode restarts, it selects the latest consistent FsImage and EditLog to use. - -Another option to increase resilience against failures is to enable High Availability using multiple NameNodes either with a shared storage on NFS or using a distributed edit log (called Journal). The latter is the recommended approach. - -Snapshots -Snapshots support storing a copy of data at a particular instant of time. One usage of the snapshot feature may be to roll back a corrupted HDFS instance to a previously known good point in time. - -Data Organization -Data Blocks -HDFS is designed to support very large files. Applications that are compatible with HDFS are those that deal with large data sets. These applications write their data only once but they read it one or more times and require these reads to be satisfied at streaming speeds. HDFS supports write-once-read-many semantics on files. A typical block size used by HDFS is 128 MB. Thus, an HDFS file is chopped up into 128 MB chunks, and if possible, each chunk will reside on a different DataNode. - -Replication Pipelining -When a client is writing data to an HDFS file with a replication factor of three, the NameNode retrieves a list of DataNodes using a replication target choosing algorithm. This list contains the DataNodes that will host a replica of that block. The client then writes to the first DataNode. The first DataNode starts receiving the data in portions, writes each portion to its local repository and transfers that portion to the second DataNode in the list. The second DataNode, in turn, starts receiving each portion of the data block, writes that portion to its repository and then flushes that portion to the third DataNode. Finally, the third DataNode writes the data to its local repository. Thus, a DataNode can be receiving data from the previous one in the pipeline and at the same time forwarding data to the next one in the pipeline. Thus, the data is pipelined from one DataNode to the next. - -Accessibility -HDFS can be accessed from applications in many different ways. Natively, HDFS provides a FileSystem Java API for applications to use. A C language wrapper for this Java API and a REST API are also available. In addition, an HTTP browser can also be used to browse the files of an HDFS instance. By using the NFS gateway, HDFS can be mounted as part of the client’s local file system. - -FS Shell -HDFS allows user data to be organized in the form of files and directories. It provides a command-line interface called FS shell that lets a user interact with the data in HDFS. The syntax of this command set is similar to other shells (e.g. bash, csh) that users are already familiar with. Here are some sample action/command pairs: - -Action Command -Create a directory named /foodir bin/hadoop dfs -mkdir /foodir -Remove a directory named /foodir bin/hadoop fs -rm -R /foodir -View the contents of a file named /foodir/myfile.txt bin/hadoop dfs -cat /foodir/myfile.txt -FS shell is targeted for applications that need a scripting language to interact with the stored data. 
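For applications that prefer the FileSystem Java API mentioned under Accessibility over invoking the shell, the three sample operations above can be written roughly as follows. This is a sketch only; the NameNode URI is a placeholder and the paths simply mirror the table.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IOUtils;

    public class FsShellEquivalents {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", "hdfs://namenode.example.com:8020"); // placeholder URI
            try (FileSystem fs = FileSystem.get(conf)) {
                // bin/hadoop dfs -mkdir /foodir
                fs.mkdirs(new Path("/foodir"));
                // bin/hadoop dfs -cat /foodir/myfile.txt
                try (FSDataInputStream in = fs.open(new Path("/foodir/myfile.txt"))) {
                    IOUtils.copyBytes(in, System.out, 4096, false);
                }
                // bin/hadoop fs -rm -R /foodir (recursive delete)
                fs.delete(new Path("/foodir"), true);
            }
        }
    }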
 - -DFSAdmin -The DFSAdmin command set is used for administering an HDFS cluster. These are commands that are used only by an HDFS administrator. Here are some sample action/command pairs: - -Action Command -Put the cluster in Safemode bin/hdfs dfsadmin -safemode enter -Generate a list of DataNodes bin/hdfs dfsadmin -report -Recommission or decommission DataNode(s) bin/hdfs dfsadmin -refreshNodes -Browser Interface -A typical HDFS install configures a web server to expose the HDFS namespace through a configurable TCP port. This allows a user to navigate the HDFS namespace and view the contents of its files using a web browser. - -Space Reclamation -File Deletes and Undeletes -If trash configuration is enabled, files removed by FS Shell are not immediately removed from HDFS. Instead, HDFS moves them to a trash directory (each user has its own trash directory under /user/<username>/.Trash). The file can be restored quickly as long as it remains in trash. - -Most recently deleted files are moved to the current trash directory (/user/<username>/.Trash/Current), and in a configurable interval, HDFS creates checkpoints (under /user/<username>/.Trash/) for files in the current trash directory and deletes old checkpoints when they are expired. See the expunge command of FS shell about checkpointing of trash. - -After the expiry of its life in trash, the NameNode deletes the file from the HDFS namespace. The deletion of a file causes the blocks associated with the file to be freed. Note that there could be an appreciable time delay between the time a file is deleted by a user and the time of the corresponding increase in free space in HDFS. - -Following is an example which shows how files are deleted from HDFS by FS Shell. We created 2 files (test1 & test2) under the directory delete - -$ hadoop fs -mkdir -p delete/test1 -$ hadoop fs -mkdir -p delete/test2 -$ hadoop fs -ls delete/ -Found 2 items -drwxr-xr-x - hadoop hadoop 0 2015-05-08 12:39 delete/test1 -drwxr-xr-x - hadoop hadoop 0 2015-05-08 12:40 delete/test2 -We are going to remove the file test1. The output below shows that the file has been moved to the Trash directory. - -$ hadoop fs -rm -r delete/test1 -Moved: hdfs://localhost:8020/user/hadoop/delete/test1 to trash at: hdfs://localhost:8020/user/hadoop/.Trash/Current -Now we are going to remove the file with the skipTrash option, which will not send the file to Trash. It will be completely removed from HDFS. - -$ hadoop fs -rm -r -skipTrash delete/test2 -Deleted delete/test2 -We can see now that the Trash directory contains only file test1. - -$ hadoop fs -ls .Trash/Current/user/hadoop/delete/ -Found 1 items -drwxr-xr-x - hadoop hadoop 0 2015-05-08 12:39 .Trash/Current/user/hadoop/delete/test1 -So file test1 goes to Trash and file test2 is deleted permanently. - -Decrease Replication Factor -When the replication factor of a file is reduced, the NameNode selects excess replicas that can be deleted. The next Heartbeat transfers this information to the DataNode. The DataNode then removes the corresponding blocks and the corresponding free space appears in the cluster. Once again, there might be a time delay between the completion of the setReplication API call and the appearance of free space in the cluster. - -References -Hadoop JavaDoc API. 
- -HDFS source code: http://hadoop.apache.org/version_control.html \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture.txt.xml.xls deleted file mode 100644 index 9018f53759f73b1f95a44b900321c7e50d248ccf..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS Architecture.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS-relation.txt deleted file mode 100644 index a41386e40f8f90ec0b8a15261f81c4fec2597db8..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS-relation.txt +++ /dev/null @@ -1,204 +0,0 @@ -number&separate machine&AGGREGATION -dataset&single physical machine&依赖 -storage capacity&single physical machine&AGGREGATION -dataset&storage capacity&依赖 -it&it&依赖 -network&machine&AGGREGATION -separate machine&storage&依赖 -Hadoop&distributed filesystem&依赖 -Design&hdf&AGGREGATION -cluster&commodity hardware&AGGREGATION -“ Very large ”&file&依赖 -hundred&megabytes , gigabytes , or terabyte&AGGREGATION -“ Very large ”&file&依赖 -store petabyte&datum&AGGREGATION -Hadoop cluster&store petabyte&依赖 -Hadoop cluster&datum&依赖 -hdf&idea&依赖 -dataset&source&依赖 -various analysis&dataset over time&依赖 -Hadoop&expensive , highly reliable hardware&依赖 -Hadoop&run&依赖 -chance&large cluster&依赖 -chance&which&依赖 -cluster&commodity hardware ( commonly available hardware&AGGREGATION -chance&which&依赖 -chance&node failure&AGGREGATION -chance&large cluster&依赖 -chance&large cluster&依赖 -chance&which&依赖 -face&such failure&AGGREGATION -ten&milliseconds range&AGGREGATION -namenode&filesystem metada&依赖 -lot&small file&AGGREGATION -limit&namenode&依赖 -limit&amount&依赖 -number&file&AGGREGATION -amount&memory&AGGREGATION -namenode&memory&依赖 -file&single writer&依赖 -write&end&依赖 -write&file&依赖 -end&file&AGGREGATION -hdf&concept&依赖 -hdf&block&依赖 -concept&block&AGGREGATION -file&block-sized chunk&依赖 -several benefits.&block abstraction&依赖 -several benefits.&distributed filesystem&依赖 -’s nothing&block&依赖 -’s nothing&a file&依赖 -they&advantage&依赖 -they&advantage of ###&依赖 -unit&abstraction a block&AGGREGATION -storage subsystem deal&storage management (&实现 -storage subsystem deal&storage management (&实现 -block&replication&依赖 -block&separate machine&依赖 -small number&separate machine and typically three )&AGGREGATION -block&small number&依赖 -block&typically three )&依赖 -reason&seek&依赖 -cost&seek&AGGREGATION -reason&cost&依赖 -time&block&依赖 -time&start&依赖 -start&block&AGGREGATION -time&disk transfer rate&依赖 -time&disk transfer rate&依赖 -time&disk transfer rate&依赖 -seek time&transfer time&AGGREGATION -many HDFS installation&128 MB block&依赖 -transfer speed&disk drive&依赖 -transfer speed&new generation&依赖 -new generation&disk drive&AGGREGATION -HDFS cluster&node operating&依赖 -HDFS cluster&namenode&依赖 -HDFS cluster&master-worker pattern&依赖 -number&datanode ( worker&AGGREGATION -two type&node operating&AGGREGATION -HDFS cluster&cluster&GENERALIZATION -HDFS cluster&two type&依赖 -namenode&filesystem namespace&依赖 -It&filesystem tree&依赖 -information&form&依赖 -information&local disk&依赖 -information&two file&依赖 -information&namespace image&依赖 -form&two file&AGGREGATION -block&datanode&依赖 -namenode&datanode&依赖 -Hadoop cluster&cluster&GENERALIZATION -one primary component&hadoop cluster and hdf&AGGREGATION -TaskTraker } hdf&hadoop cluster and 
hdf&依赖 -mapping&block&AGGREGATION -master ( namenode )&file system namespace operation&依赖 -system&clients& -file system&system&GENERALIZATION -datanode&filesystem&依赖 -workhorse&filesystem&AGGREGATION -list&block&AGGREGATION -they&block&依赖 -They&block&依赖 -they&list&依赖 -they&storing&依赖 -what precaution hdf&file system&依赖 -persistent state&filesystem metada&AGGREGATION -what precaution hdf&what precaution hdf&依赖 -namenode failure&persistent state&依赖 -namenode failure&filesystem metada&依赖 -case&namenode failure&AGGREGATION -first way&file&依赖 -namenode&persistent state&依赖 -namenode&multiple filesystem&依赖 -its&state& -It&secondary namenode&依赖 -its&name& -main role&namespace image&依赖 -Its&role& -namenode&file and block&依赖 -namenode&filesystem&依赖 -namenode&reference&依赖 -namenode&memory&依赖 -portion&filesystem namespace&AGGREGATION -HDFS Federation&cluster&依赖 -HDFS Federation&scale&依赖 -filesystem namespace&namespace&GENERALIZATION -HDFS Federation&cluster&依赖 -HDFS Federation&scale&依赖 -one namenode&file&依赖 -one namenode&file&依赖 -second namenode&/ share&依赖 -second namenode&file&依赖 -namenode&one another&依赖 -failure&one namenode&AGGREGATION -availability&namespace&AGGREGATION -failure&namespace&依赖 -failure&availability&依赖 -so datanodes register&multiple block pool&依赖 -so datanodes register&multiple block pool&依赖 -so datanodes register&namenode&依赖 -so datanodes register&cluster and store block&依赖 -so datanodes register&cluster and store block&依赖 -so datanodes register&multiple block pool&依赖 -so datanodes register&cluster and store block&依赖 -so datanodes register&multiple block pool&依赖 -so datanodes register&namenode&依赖 -so datanodes register&cluster and store block&依赖 -so datanodes register&namenode&依赖 -so datanodes register&namenode&依赖 -so datanodes register&cluster and store block&依赖 -so datanodes register&cluster and store block&依赖 -so datanodes register&cluster and store block&依赖 -so datanodes register&cluster and store block&依赖 -sole repository&metada&AGGREGATION -clients —&list file&依赖 -clients —&list file&依赖 -namenode&metada&依赖 -single point failure ( spof )&failure ( spof )&AGGREGATION -namenode&failure ( spof )&依赖 -whole Hadoop system&event&依赖 -administrator&new primary namenode&依赖 -administrator&filesystem metadata replica&依赖 -one&filesystem metadata replica&AGGREGATION -administrator&one&依赖 -new namenode&request&依赖 -its&image& -its&log& -ii )&block report&依赖 -it&memory&依赖 -it&namespace image&依赖 -it&namenode&依赖 -time&large cluster&依赖 -time&large cluster&依赖 -time&many files and block&依赖 -time&many files and block&依赖 -0.23 release series&situation&依赖 -0.23 release series&hadoop remedy&AGGREGATION -pair&implementation&依赖 -pair&implementation&依赖 -pair&implementation&依赖 -pair&implementation&依赖 -pair&namenode&AGGREGATION -standby&duty&依赖 -its&duties& -failure&active namenode&AGGREGATION -event&failure&AGGREGATION -namenode&highly-available shared storage&依赖 -namenode&memory& -datanode&namenode&依赖 -block mapping&’s memory&依赖 -datanode&block report&依赖 -namenode failover&mechanism&依赖 -transition&system&依赖 -transition&new entity&依赖 -first implementation&ZooKeeper&依赖 -case&routine maintenance&AGGREGATION -Failover&adminstrator&依赖 -Failover&routine maintenance&依赖 -Failover&example&依赖 -Failover&case&依赖 -failover controller&role&依赖 -failover controller&orderly transition&依赖 -failover controller&both namenode&依赖 -case&ungraceful failover&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS.txt deleted file mode 100644 index 
3853475f4a577008101cc19227770dc4ab4c9757..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS.txt +++ /dev/null @@ -1,36 +0,0 @@ -HDFS -When a dataset outgrows the storage capacity of a single physical machine, it becomes necessary to partition it across a number of separate machines. Filesystems that manage the storage across a network of machines are called distributed filesystems. -Hadoop comes with a distributed filesystem called HDFS, which stands for Hadoop Distributed Filesystem. -The Design of HDFS : -HDFS is a filesystem designed for storing very large files with streaming data access patterns, running on clusters of commodity hardware. -Very large files: -“Very large” in this context means files that are hundreds of megabytes, gigabytes, or terabytes in size. There are Hadoop clusters running today that store petabytes of data. -Streaming data access : -HDFS is built around the idea that the most efficient data processing pattern is a write-once, read-many-times pattern. A dataset is typically generated or copied from source, then various analyses are performed on that dataset over time. -Commodity hardware : -Hadoop doesn’t require expensive, highly reliable hardware to run on. It’s designed to run on clusters of commodity hardware (commonly available hardware available from multiple vendors3) for which the chance of node failure across the cluster is high, at least for large clusters. HDFS is designed to carry on working without a noticeable interruption to the user in the face of such failure. -These are areas where HDFS is not a good fit today: -Low-latency data access : -Applications that require low-latency access to data, in the tens of milliseconds range, will not work well with HDFS. -Lots of small files : -Since the namenode holds filesystem metadata in memory, the limit to the number of files in a filesystem is governed by the amount of memory on the namenode. -Multiple writers, arbitrary file modifications: -Files in HDFS may be written to by a single writer. Writes are always made at the end of the file. There is no support for multiple writers, or for modifications at arbitrary offsets in the file. -HDFS Concepts -Blocks: -HDFS has the concept of a block, but it is a much larger unit—64 MB by default. Files in HDFS are broken into block-sized chunks, which are stored as independent units. -Having a block abstraction for a distributed filesystem brings several benefits.: -The first benefit : -A file can be larger than any single disk in the network. There’s nothing that requires the blocks from a file to be stored on the same disk, so they can take advantage of any of the disks in the cluster. -Second: -Making the unit of abstraction a block rather than a file simplifies the storage subsystem. The storage subsystem deals with blocks, simplifying storage management (since blocks are a fixed size, it is easy to calculate how many can be stored on a given disk) and eliminating metadata concerns. -Third: -Blocks fit well with replication for providing fault tolerance and availability. To insure against corrupted blocks and disk and machine failure, each block is replicated to a small number of physically separate machines (typically three). -Why Is a Block in HDFS So Large? -HDFS blocks are large compared to disk blocks, and the reason is to minimize the cost of seeks. 
By making a block large enough, the time to transfer the data from the disk can be made to be significantly larger than the time to seek to the start of the block. Thus the time to transfer a large file made of multiple blocks operates at the disk transfer rate. -A quick calculation shows that if the seek time is around 10 ms, and the transfer rate is 100 MB/s, then to make the seek time 1% of the transfer time, we need to make the block size around 100 MB. The default is actually 64 MB, although many HDFS installations use 128 MB blocks. This figure will continue to be revised upward as transfer speeds grow with new generations of disk drives. -Namenodes and Datanodes: -An HDFS cluster has two types of node operating in a master-worker pattern: a namenode (the master) and a number of datanodes (workers). The namenode manages the filesystem namespace. It maintains the filesystem tree and the metadata for all the files and directories in the tree. This information is stored persistently on the local disk in the form of two files: the namespace image and the edit log. The namenode also knows the datanodes on which all the blocks for a given file are located. -Apache Hadoop is designed to have a Master-Slave architecture: Master: NameNode, JobTracker; Slaves: {DataNode, TaskTracker}, ….. {DataNode, TaskTracker}. HDFS is one of the primary components of a Hadoop cluster, and HDFS is designed to have a Master-Slave architecture. Master: NameNode; Slaves: {Datanode}…..{Datanode} - The Master (NameNode) manages the file system namespace operations like opening, closing, and renaming files and directories and determines the mapping of blocks to DataNodes, along with regulating access to files by clients. - Slaves (DataNodes) are responsible for serving read and write requests from the file system’s clients, along with performing block creation, deletion, and replication upon instruction from the Master (NameNode). Datanodes are the workhorses of the filesystem. They store and retrieve blocks when they are told to (by clients or the namenode), and they report back to the namenode periodically with lists of blocks that they are storing. NameNode failure: if the machine running the namenode failed, all the files on the filesystem would be lost since there would be no way of knowing how to reconstruct the files from the blocks on the datanodes. What precautions does HDFS take to recover the file system in case of namenode failure? The first way is to back up the files that make up the persistent state of the filesystem metadata. Hadoop can be configured so that the namenode writes its persistent state to multiple filesystems. These writes are synchronous and atomic. The usual configuration choice is to write to local disk as well as a remote NFS mount. -Second way: It is also possible to run a secondary namenode, which despite its name does not act as a namenode. Its main role is to periodically merge the namespace image with the edit log to prevent the edit log from becoming too large. But it can be shaped to act as a primary namenode. HDFS Federation: The namenode keeps a reference to every file and block in the filesystem in memory, which means that on very large clusters with many files, memory becomes the limiting factor for scaling. HDFS Federation, introduced in the 0.23 release series, allows a cluster to scale by adding namenodes, each of which manages a portion of the filesystem namespace. For example, one namenode might manage all the files rooted under /user, say, and a second namenode might handle files under /share. 
Each namenode manages a namespace volume; namespace volumes are independent of each other, which means namenodes do not communicate with one another, and furthermore the failure of one namenode does not affect the availability of the namespaces managed by other namenodes. Block pool storage is not partitioned, however, so datanodes register with each namenode in the cluster and store blocks from multiple block pools. HDFS High-Availability: The namenode is still a single point of failure (SPOF), since if it did fail, all clients—including MapReduce jobs—would be unable to read, write, or list files, because the namenode is the sole repository of the metadata and the file-to-block mapping. In such an event, the whole Hadoop system would effectively be out of service until a new namenode could be brought online. To recover from a failed namenode in this situation, an administrator starts a new primary namenode with one of the filesystem metadata replicas, and configures datanodes and clients to use this new namenode. The new namenode is not able to serve requests until it has i) loaded its namespace image into memory, ii) replayed its edit log, and iii) received enough block reports from the datanodes to leave safe mode. On large clusters with many files and blocks, the time it takes for a namenode to start from cold can be 30 minutes or more. The 0.23 release series of Hadoop remedies this situation by adding support for HDFS high-availability (HA). In this implementation there is a pair of namenodes in an active-standby configuration. In the event of the failure of the active namenode, the standby takes over its duties to continue servicing client requests without a significant interruption. A few architectural changes are needed to allow this to happen: - The namenodes must use highly-available shared storage to share the edit log. - Datanodes must send block reports to both namenodes since the block mappings are stored in a namenode’s memory, and not on disk. - Clients must be configured to handle namenode failover, which uses a mechanism that is transparent to users. Failover and fencing: The transition from the active namenode to the standby is managed by a new entity in the system called the failover controller. Failover controllers are pluggable, but the first implementation uses ZooKeeper to ensure that only one namenode is active. Failover may also be initiated manually by an administrator, in the case of routine maintenance, for example. This is known as a graceful failover, since the failover controller arranges an orderly transition for both namenodes to switch roles. In the case of an ungraceful failover, the HA implementation goes to great lengths to ensure that the previously active namenode is prevented from doing any damage and causing corruption—a method known as fencing. 
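With HA enabled, client-side transparency comes from pointing clients at a logical nameservice instead of a single namenode host. The sketch below shows the usual shape of that client configuration as an illustration only: the nameservice name, hostnames, and ports are placeholders, and in practice these properties are set in hdfs-site.xml rather than in code.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;

    public class HaClientConfig {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // Logical nameservice instead of a single namenode host; every value here is a placeholder.
            conf.set("fs.defaultFS", "hdfs://mycluster");
            conf.set("dfs.nameservices", "mycluster");
            conf.set("dfs.ha.namenodes.mycluster", "nn1,nn2");
            conf.set("dfs.namenode.rpc-address.mycluster.nn1", "namenode1.example.com:8020");
            conf.set("dfs.namenode.rpc-address.mycluster.nn2", "namenode2.example.com:8020");
            // Failover proxy provider that retries against the other namenode after a failover.
            conf.set("dfs.client.failover.proxy.provider.mycluster",
                    "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
            try (FileSystem fs = FileSystem.get(conf)) {
                System.out.println("Connected to " + fs.getUri());
            }
        }
    }

Which namenode is currently active is resolved by the proxy provider at call time, so application code does not change when a failover occurs.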
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS.txt.xml.xls deleted file mode 100644 index a11c1e626af77bcce485b28b67ae14d5153560ec..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/HDFS.txt.xml.xls and /dev/null differ diff --git "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer-relation.txt" "b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer-relation.txt" deleted file mode 100644 index 7849dd744808c551c989ad4c8feb73c2f7b9f3f4..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer-relation.txt" +++ /dev/null @@ -1,258 +0,0 @@ -file system ( hdf ) architecture –&Guide&依赖 -every datum engineer download share aniruddha bhandari — october 28 , 2020 beginner big datum data engineering hadoop overview&Components&依赖 -every datum engineer download share aniruddha bhandari — october 28 , 2020 beginner big datum data engineering hadoop overview&HDFS&依赖 -Components&HDFS&AGGREGATION -massive amount&datum&AGGREGATION -your&Tweet& -you&next Tweet&依赖 -your&message& -you&step&依赖 -you&technology&依赖 -you&datum&依赖 -you&datum&依赖 -it&single machine&依赖 -you&lovely 3 AM tweets * cough *&依赖 -your&*& -I&thinking& -you&storage component&依赖 -you&Hadoop&依赖 -you&file system ( hdf&依赖 -your&side& -storage component&Hadoop&AGGREGATION -you&amazing power&依赖 -you&file system ( hdf&依赖 -you&Hadoop&依赖 -you&storage component&依赖 -you&amazing power&依赖 -you&storage component&依赖 -amazing power&file system ( hdf&AGGREGATION -you&Hadoop&依赖 -you&amazing power&依赖 -you&file system ( hdf&依赖 -It&Hadoop&依赖 -most important component&Hadoop&AGGREGATION -its&components& -we&article&依赖 -file system ( hdf&what&依赖 -table&Contents&AGGREGATION -component&hdf&AGGREGATION -HDFS Replication Management Replication&Blocks&AGGREGATION -huge volume&datum&AGGREGATION -It&datum&依赖 -It&huge volume&依赖 -It&single machine&依赖 -it&datum&依赖 -multiple machine&storage&依赖 -network&machine&AGGREGATION -file system ( hdf )&Hadoop&依赖 -datum&machine&依赖 -datum&distributed manner&依赖 -cluster&machine&AGGREGATION -datum&cluster&依赖 -few property&existence&依赖 -its&existence& -it&few property&依赖 -petabyte&datum&AGGREGATION -philosophy&most effective data processing pattern&依赖 -It&philosophy&依赖 -Cost-effective – hdf&commodity hardware&依赖 -cluster&commodity hardware&AGGREGATION -Cost-effective – hdf&cluster&依赖 -component&file system ( hdf )&AGGREGATION -hdf&two main component&依赖 -– data blocks and node&data block&依赖 -hdf break&file&依赖 -hdf break&file&依赖 -it&them&依赖 -smaller unit&hdf&依赖 -you&it&依赖 -size&default&依赖 -size&default&依赖 -you&requirement&依赖 -file&size 512MB&AGGREGATION -you&file&依赖 -you&size 512MB&依赖 -it&128MB each&依赖 -it&4 block&依赖 -file&size 524MB&AGGREGATION -you&size 524MB&依赖 -it&5 block&依赖 -4&128MB each&依赖 -5th&12MB&依赖 -last block&disk&依赖 -last block&complete 128MB&依赖 -multiple block&10KB&AGGREGATION -amount&petra byte&依赖 -we&Hadoop&依赖 -we&amount&依赖 -we&datum&依赖 -order&petra byte&AGGREGATION -amount&petra byte&依赖 -amount&datum&AGGREGATION -we&block&依赖 -we&small size&依赖 -colossal number&block&AGGREGATION -block&small size&AGGREGATION -block&lot&依赖 -block&overhead&依赖 
-location&block&AGGREGATION -lot&overhead&AGGREGATION -it&it&依赖 -choke&single machine&AGGREGATION -It&proper spread&依赖 -It&workload&依赖 -proper spread&workload&AGGREGATION -they&block&依赖 -Namenode&master-worker architecture&依赖 -Namenode&master-worker architecture&依赖 -filesystem tree or hierarchy&files and directory&AGGREGATION -owner&file&AGGREGATION -It&file&依赖 -It&location&依赖 -block&file&AGGREGATION -It&block&依赖 -their&size& -information&two file&依赖 -information&form&依赖 -information&Fsimage&依赖 -information&local disk&依赖 -form&two file&AGGREGATION -fsimage store&information&依赖 -fsimage store&filesystem&依赖 -fsimage store&information&依赖 -fsimage store&filesystem&依赖 -fsimage store&information&依赖 -fsimage store&filesystem&依赖 -fsimage store&filesystem&依赖 -fsimage store&filesystem&依赖 -fsimage store&information&依赖 -fsimage store&information&依赖 -fsimage store&information&依赖 -fsimage store&filesystem&依赖 -it&replication level&依赖 -it&file&依赖 -their&sizes& -it&directory&依赖 -it&modification time and permission&依赖 -Edit log&write operation&依赖 -Edit log&track&依赖 -client&that&依赖 -track&write operation&AGGREGATION -Edit log&write operation&依赖 -Edit log&track&依赖 -client&hdf&依赖 -it&Namenode&依赖 -client&information&依赖 -Namenode&block&依赖 -Namenode&location&依赖 -Namenode&location&依赖 -Namenode&block&依赖 -Namenode&block&依赖 -datanode&deletion , etc.&依赖 -datanode&block&依赖 -They&Namenode&依赖 -their&health& -They&heartbeat&依赖 -it&health&依赖 -Namenode&block&依赖 -list&block&AGGREGATION -mapping&block&AGGREGATION -Namenode&mapping&依赖 -Namenode&block&依赖 -Namenode&mapping&依赖 -DataNode&list&依赖 -Namenode&block&依赖 -DataNode&block&依赖 -Namenode&mapping&依赖 -its&memory& -node&addition&依赖 -node&node&依赖 -node&node&依赖 -node&cluster&依赖 -node&addition&依赖 -node&cluster&依赖 -two type&node&AGGREGATION -node&two type&依赖 -node&two type&依赖 -case&failure&AGGREGATION -latest copy&Edit Log&AGGREGATION -we&Edit Log&依赖 -we&latest copy&依赖 -track&transaction&AGGREGATION -we&long time&依赖 -Edit log&size&依赖 -we&node&依赖 -lot&time&AGGREGATION -filesystem&time&依赖 -we&Secondary Namenode&依赖 -Secondary Namenode&Namenode&GENERALIZATION -check‐points&’s in-memory file system metada&AGGREGATION -whose main task&Edit log&依赖 -primary&metadata& -Secondary Namenode&cluster&依赖 -whose&task& -lot&memory&AGGREGATION -Secondary namenode&separate node&依赖 -Secondary namenode&cluster&依赖 -Secondary Namenode&Namenode&依赖 -Secondary Namenode&name&依赖 -its&name& -It&Checkpointing&依赖 -copy&latest Fsimage&AGGREGATION -replication&block&AGGREGATION -one&HDFS&依赖 -one&block&依赖 -one&block&依赖 -one&best feature&AGGREGATION -one&block&依赖 -best feature&hdf&AGGREGATION -one&HDFS&依赖 -one&HDFS&依赖 -it&them&依赖 -it&block&依赖 -’s&question&依赖 -reliable storage component&Hadoop&AGGREGATION -Replication&blocks hdf&AGGREGATION -Replication&Hadoop&依赖 -Replication&Hadoop&依赖 -block&cluster&依赖 -block&different Data node&依赖 -two more copy&it&AGGREGATION -we&much storage&依赖 -5 block&128MB each&AGGREGATION -we&128MB each&依赖 -we&5 block&依赖 -more&machine&AGGREGATION -We&cluster&依赖 -do namenode&replica&依赖 -we&Rack&依赖 -we&look&依赖 -we&Hadoop&依赖 -Rack&machine&依赖 -collection&machine&AGGREGATION -Rack&30-40&依赖 -Rack&hadoop )&依赖 -Rack awareness Replica storage&reliability and read/write bandwidth&依赖 -we&fault tolerance&依赖 -replica&same node&依赖 -Hadoop&deal&依赖 -Hadoop&default strategy&依赖 -first replica&example&依赖 -client&same Datanode&依赖 -first replica&same Datanode&依赖 -second replica&different Datanode&依赖 -third replica&different Datanode&依赖 -third replica&same rack&依赖 -third replica&second&依赖 -subsequent replica&random 
Data node&依赖 -subsequent replica&cluster&依赖 -I&solid understanding&依赖 -it&datum&依赖 -I&what&依赖 -file system ( hdf )&what&依赖 -better understanding&Hadoop&AGGREGATION -I&Hadoop&依赖 -Definitive Guide&Guide&GENERALIZATION -I&Definitive Guide&依赖 -MapReduce Types&Tables&AGGREGATION -article&it&依赖 diff --git "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt" "b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt" deleted file mode 100644 index 34635b3559f623c0dba0fdb73f24adecf4d9fa6d..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt" +++ /dev/null @@ -1,161 +0,0 @@ -Hadoop Distributed File System (HDFS) Architecture – A Guide to HDFS for Every Data Engineer -download -Share -Aniruddha Bhandari — October 28, 2020 -Beginner Big data Data Engineering Hadoop -Overview -Get familiar with Hadoop Distributed File System (HDFS) -Understand the Components of HDFS - - -Introduction -In contemporary times, it is commonplace to deal with massive amounts of data. From your next WhatsApp message to your next Tweet, you are creating data at every step when you interact with technology. Now multiply that by 4.5 billion people on the internet – the math is simply mind-boggling! - -But ever wondered how to handle such data? Is it stored on a single machine? What if the machine fails? Will you lose your lovely 3 AM tweets *cough*? - - - -The answer is No. I am pretty sure you are already thinking about Hadoop. Hadoop is an amazing framework. With Hadoop by your side, you can leverage the amazing powers of Hadoop Distributed File System (HDFS)-the storage component of Hadoop. It is probably the most important component of Hadoop and demands a detailed explanation. - -So, in this article, we will learn what Hadoop Distributed File System (HDFS) really is and about its various components. Also, we will see what makes HDFS tick – that is what makes it so special. Let’s find out! - - - -Table of Contents -What is Hadoop Distributed File System (HDFS)? -What are the components of HDFS? -Blocks in HDFS? -Namenode in HDFS -Datanodes in HDFS -Secondary Node in HDFS -Replication Management -Replication of Blocks -What is a Rack in Hadoop? -Rack Awareness - - -What is Hadoop Distributed File System(HDFS)? -It is difficult to maintain huge volumes of data in a single machine. Therefore, it becomes necessary to break down the data into smaller chunks and store it on multiple machines. - -Filesystems that manage the storage across a network of machines are called distributed file systems. - -Hadoop Distributed File System (HDFS) is the storage component of Hadoop. All data stored on Hadoop is stored in a distributed manner across a cluster of machines. But it has a few properties that define its existence. - -Huge volumes – Being a distributed file system, it is highly capable of storing petabytes of data without any glitches. -Data access – It is based on the philosophy that “the most effective data processing pattern is write-once, the read-many-times pattern”. -Cost-effective – HDFS runs on a cluster of commodity hardware. These are inexpensive machines that can be bought from any vendor. - - -What are the components of the Hadoop Distributed File System(HDFS)? 
-HDFS has two main components, broadly speaking, – data blocks and nodes storing those data blocks. But there is more to it than meets the eye. So, let’s look at this one by one to get a better understanding. - -HDFS Blocks -HDFS breaks down a file into smaller units. Each of these units is stored on different machines in the cluster. This, however, is transparent to the user working on HDFS. To them, it seems like storing all the data onto a single machine. - -These smaller units are the blocks in HDFS. The size of each of these blocks is 128MB by default, you can easily change it according to requirement. So, if you had a file of size 512MB, it would be divided into 4 blocks storing 128MB each. - -hadoop hdfs blocks - -If, however, you had a file of size 524MB, then, it would be divided into 5 blocks. 4 of these would store 128MB each, amounting to 512MB. And the 5th would store the remaining 12MB. That’s right! This last block won’t take up the complete 128MB on the disk. - -hadoop hdfs blocks split - -But, you must be wondering, why such a huge amount in a single block? Why not multiple blocks of 10KB each? Well, the amount of data with which we generally deal with in Hadoop is usually in the order of petra bytes or higher. - -Therefore, if we create blocks of small size, we would end up with a colossal number of blocks. This would mean we would have to deal with equally large metadata regarding the location of the blocks which would just create a lot of overhead. And we don’t really want that! - -There are several perks to storing data in blocks rather than saving the complete file. - -The file itself would be too large to store on any single disk alone. Therefore, it is prudent to spread it across different machines on the cluster. -It would also enable a proper spread of the workload and prevent the choke of a single machine by taking advantage of parallelism. -Now, you must be wondering, what about the machines in the cluster? How do they store the blocks and where is the metadata stored? Let’s find out. - - - -Namenode in HDFS -HDFS operates in a master-worker architecture, this means that there are one master node and several worker nodes in the cluster. The master node is the Namenode. - -Namenode is the master node that runs on a separate node in the cluster. - -Manages the filesystem namespace which is the filesystem tree or hierarchy of the files and directories. -Stores information like owners of files, file permissions, etc for all the files. -It is also aware of the locations of all the blocks of a file and their size. -All this information is maintained persistently over the local disk in the form of two files: Fsimage and Edit Log. - -Fsimage stores the information about the files and directories in the filesystem. For files, it stores the replication level, modification and access times, access permissions, blocks the file is made up of, and their sizes. For directories, it stores the modification time and permissions. -Edit log on the other hand keeps track of all the write operations that the client performs. This is regularly updated to the in-memory metadata to serve the read requests. -Whenever a client wants to write information to HDFS or read information from HDFS, it connects with the Namenode. The Namenode returns the location of the blocks to the client and the operation is carried out. - -Yes, that’s right, the Namenode does not store the blocks. For that, we have separate nodes. - - - -Datanodes in HDFS -Datanodes are the worker nodes. 
They are inexpensive commodity hardware that can be easily added to the cluster. - -Datanodes are responsible for storing, retrieving, replicating, deletion, etc. of blocks when asked by the Namenode. - -They periodically send heartbeats to the Namenode so that it is aware of their health. With that, a DataNode also sends a list of blocks that are stored on it so that the Namenode can maintain the mapping of blocks to Datanodes in its memory. - -But in addition to these two types of nodes in the cluster, there is also another node called the Secondary Namenode. Let’s look at what that is. - - - -Secondary Namenode in HDFS -Suppose we need to restart the Namenode, which can happen in case of a failure. This would mean that we have to copy the Fsimage from disk to memory. Also, we would also have to copy the latest copy of Edit Log to Fsimage to keep track of all the transactions. But if we restart the node after a long time, then the Edit log could have grown in size. This would mean that it would take a lot of time to apply the transactions from the Edit log. And during this time, the filesystem would be offline. Therefore, to solve this problem, we bring in the Secondary Namenode. - -Secondary Namenode is another node present in the cluster whose main task is to regularly merge the Edit log with the Fsimage and produce check‐points of the primary’s in-memory file system metadata. This is also referred to as Checkpointing. - -hadop hdfs checkpointing - - - -But the checkpointing procedure is computationally very expensive and requires a lot of memory, which is why the Secondary namenode runs on a separate node on the cluster. - -However, despite its name, the Secondary Namenode does not act as a Namenode. It is merely there for Checkpointing and keeping a copy of the latest Fsimage. - - - -Replication Management in HDFS -Now, one of the best features of HDFS is the replication of blocks which makes it very reliable. But how does it replicate the blocks and where does it store them? Let’s answer those questions now. - - - -Replication of blocks -HDFS is a reliable storage component of Hadoop. This is because every block stored in the filesystem is replicated on different Data Nodes in the cluster. This makes HDFS fault-tolerant. - -The default replication factor in HDFS is 3. This means that every block will have two more copies of it, each stored on separate DataNodes in the cluster. However, this number is configurable. - -hadoop hdfs replication - - - -But you must be wondering doesn’t that mean that we are taking up too much storage. For instance, if we have 5 blocks of 128MB each, that amounts to 5*128*3 = 1920 MB. True. But then these nodes are commodity hardware. We can easily scale the cluster to add more of these machines. The cost of buying machines is much lower than the cost of losing the data! - -Now, you must be wondering, how does Namenode decide which Datanode to store the replicas on? Well, before answering that question, we need to have a look at what is a Rack in Hadoop. - - - -What is a Rack in Hadoop? -A Rack is a collection of machines (30-40 in Hadoop) that are stored in the same physical location. There are multiple racks in a Hadoop cluster, all connected through switches. - -rack - -Rack awareness -Replica storage is a tradeoff between reliability and read/write bandwidth. To increase reliability, we need to store block replicas on different racks and Datanodes to increase fault tolerance. 
While the write bandwidth is lowest when replicas are stored on the same node. Therefore, Hadoop has a default strategy to deal with this conundrum, also known as the Rack Awareness algorithm. - -For example, if the replication factor for a block is 3, then the first replica is stored on the same Datanode on which the client writes. The second replica is stored on a different Datanode but on a different rack, chosen randomly. While the third replica is stored on the same rack as the second but on a different Datanode, again chosen randomly. If, however, the replication factor was higher, then the subsequent replicas would be stored on random Data Nodes in the cluster. - - rack awareness - - - -Endnotes -I hope by now you have got a solid understanding of what Hadoop Distributed File System(HDFS) is, what are its important components, and how it stores the data. There are however still a few more concepts that we need to cover with respect to Hadoop Distributed File System(HDFS), but that is a story for another article. - -For now, I recommend you go through the following articles to get a better understanding of Hadoop and this Big Data world! - -Hadoop Ecosystem -Introduction to MapReduce -Types of Tables in Apache Hive -Last but not the least, I recommend reading Hadoop: The Definitive Guide by Tom White. This article was highly inspired by it. \ No newline at end of file diff --git "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt.xml.xls" "b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt.xml.xls" deleted file mode 100644 index b5dfeed6371384b0b5746dc922d01952eb726c61..0000000000000000000000000000000000000000 Binary files "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt.xml.xls" and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions-relation.txt deleted file mode 100644 index 29ccf27c8a96e9747d79f32ab2a23ec10bf18885..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions-relation.txt +++ /dev/null @@ -1,457 +0,0 @@ -career&Exclusive offer&依赖 -career&Exclusive offer&依赖 -career&Big Data Course !!&依赖 -your&career& -career&Big Data Course !!&依赖 -your&questions& -world&system& -its&tolerance& -It&fault tolerance and high availability&依赖 -you&article&依赖 -you&HDFS Architecture Guide&依赖 -Hadoop HDFS&HDFS&GENERALIZATION -Assumptions and goal&HDFS design&AGGREGATION -HDFS architecture tutorial&NameNode , DataNode&依赖 -HDFS architecture tutorial&HDFS , Secondary node , checkpoint node , Backup Node&依赖 -HDFS architecture tutorial&HDFS&依赖 -HDFS architecture tutorial&detailed architecture&依赖 -detailed architecture&Hadoop HDFS&AGGREGATION -hdf feature&Rack awareness&依赖 -hdf feature&Rack awareness&依赖 -hdf store&running&依赖 -hdf store&large file&依赖 -cluster&commodity hardware&AGGREGATION -It&large file&依赖 -It&storage&依赖 -It&less number&依赖 -It&principle&依赖 -storage&less number&AGGREGATION -huge number&small file&AGGREGATION -less number&large file&AGGREGATION -principle&storage&AGGREGATION -HDFS stores datum&case&依赖 
-case&hardware failure&AGGREGATION -hardware failure&failure&GENERALIZATION -HDFS stores datum&hardware failure&依赖 -HDFS instance&hundred or thousand&依赖 -HDFS instance&server machine&依赖 -hundred or thousand&server machine&AGGREGATION -system&data& -part&’s datum&AGGREGATION -huge number&component&AGGREGATION -core architectural goal&hdf&AGGREGATION -data access HDFS application&dataset&依赖 -data access HDFS application&streaming access&依赖 -their&datasets& -Hadoop hdf&user&依赖 -Hadoop hdf&batch processing&依赖 -Hadoop hdf&interactive use&依赖 -high throughput&data access&AGGREGATION -force&data access&依赖 -low latency&data access&AGGREGATION -force&data access&依赖 -force&low latency&依赖 -Large datasets hdf&large data set&依赖 -file&standard practice&依赖 -file&standard practice&依赖 -architecture&such a way&依赖 -huge amount&datum&AGGREGATION -architecture&best&依赖 -architecture&such a way&依赖 -architecture&hdf&AGGREGATION -architecture&best&依赖 -to hundred&node&AGGREGATION -ton&million&AGGREGATION -million&file&AGGREGATION -Simple coherency model&file&依赖 -Simple coherency model&write-once-read-many access model&依赖 -theory&write-once-read-many access model&AGGREGATION -Simple coherency model&theory&依赖 -MapReduce-based application or web crawler application&model&依赖 -main advantage&system&依赖 -overall throughput&system&AGGREGATION -main advantage&overall throughput&依赖 -It&network congestion&依赖 -it&one platform&依赖 -widespread adoption&hdf&AGGREGATION -large set&datum&AGGREGATION -Hadoop Distributed File System&master-slave architecture&依赖 -master-slave architecture&architecture&GENERALIZATION -cluster&single master node&依赖 -file&one or more block&依赖 -block&different slave machine&依赖 -you&this article )&依赖 -you&which&依赖 -block&file&AGGREGATION -master node store&master node store&依赖 -slave node&data block&依赖 -data block&file&AGGREGATION -slave node&file&依赖 -centerpiece&Hadoop Distributed File System&AGGREGATION -NameNode&Hadoop Distributed File System&依赖 -It&file system namespace&依赖 -namenode store information&two file&依赖 -namenode store information&two file&依赖 -namenode store information&namenode store information&依赖 -namenode store information&two file&依赖 -namenode store information&two file&依赖 -namenode store information&two file&依赖 -namenode store information&form&依赖 -namenode store information&form&依赖 -namenode store information&two file&依赖 -namenode store information&namenode store information&依赖 -namenode store information&namenode store information&依赖 -namenode store information&form&依赖 -namenode store information&form&依赖 -namenode store information&local disk&依赖 -namenode store information&local disk&依赖 -namenode store information&namenode store information&依赖 -namenode store information&two file&依赖 -namenode store information&namenode store information&依赖 -namenode store information&local disk&依赖 -namenode store information&namenode store information&依赖 -namenode store information&form&依赖 -namenode store information&namenode store information&依赖 -namenode store information&two file&依赖 -form&two file&AGGREGATION -namenode store information&namenode store information&依赖 -namenode store information&form&依赖 -namenode store information&form&依赖 -namenode store information&local disk&依赖 -namenode store information&form&依赖 -namenode store information&local disk&依赖 -namenode store information&two file&依赖 -namenode store information&form&依赖 -namenode store information&form&依赖 -namenode store information&two file&依赖 -namenode store information&two file&依赖 -namenode store information&form&依赖 -namenode store 
information&namenode store information&依赖 -namenode store information&local disk&依赖 -namenode store information&two file&依赖 -namenode store information&namenode store information&依赖 -namenode store information&local disk&依赖 -namenode store information&two file&依赖 -namenode store information&namenode store information&依赖 -namenode store information&form&依赖 -namenode store information&local disk&依赖 -namenode store information&two file&依赖 -namenode store information&local disk&依赖 -namenode store information&two file&依赖 -namenode store information&form&依赖 -namenode store information&form&依赖 -namenode store information&namenode store information&依赖 -namenode store information&form&依赖 -namenode store information&local disk&依赖 -namenode store information&form&依赖 -namenode store information&local disk&依赖 -namenode store information&namenode store information&依赖 -namenode store information&local disk&依赖 -namenode store information&form&依赖 -namenode store information&namenode store information&依赖 -namenode store information&local disk&依赖 -namenode store information&two file&依赖 -namenode store information&local disk&依赖 -namenode store information&two file&依赖 -namenode store information&two file&依赖 -namenode store information&local disk&依赖 -namenode store information&namenode store information&依赖 -namenode store information&two file&依赖 -namenode store information&namenode store information&依赖 -namenode store information&local disk&依赖 -namenode store information&local disk&依赖 -namenode store information&local disk&依赖 -namenode store information&namenode store information&依赖 -namenode store information&local disk&依赖 -namenode store information&namenode store information&依赖 -namenode store information&form&依赖 -namenode store information&namenode store information&依赖 -namenode store information&two file&依赖 -namenode store information&local disk&依赖 -namenode store information&namenode store information&依赖 -namenode store information&form&依赖 -namenode store information&form&依赖 -fsimage stand&File System image&依赖 -fsimage stand&File System image&依赖 -It&NameNode creation&依赖 -It&complete namespace&依赖 -complete namespace&Hadoop file system&AGGREGATION -It&recent change&依赖 -It&file system namespace operation&依赖 -function&HDFS NameNode&AGGREGATION -NameNode&DataNodes&依赖 -mapping&block&AGGREGATION -It&DataNodes&依赖 -It&mapping&依赖 -It&block&依赖 -It&file&依赖 -namenode record&change&依赖 -namenode record&made&依赖 -It&location&依赖 -It&file&依赖 -location&block&AGGREGATION -It&block&依赖 -NameNode&care&依赖 -NameNode&block&依赖 -NameNode&replication factor&依赖 -replication factor&block&AGGREGATION -NameNode&datanode&依赖 -NameNode&heartbeat and block report&依赖 -NameNode&new datanode&依赖 -NameNode&new replica&依赖 -NameNode&failure&依赖 -NameNode&Hadoop2&依赖 -single point&failure&AGGREGATION -High Availability Hadoop cluster architecture&two or more namenode&依赖 -High Availability Hadoop cluster architecture&running&依赖 -High Availability Hadoop cluster architecture&two or more namenode&依赖 -High Availability Hadoop cluster architecture&running&依赖 -datanode&Hadoop HDFS&依赖 -They&file&依赖 -They&block&依赖 -function&DataNode DataNode&AGGREGATION -DataNodes&block creation&依赖 -datanode&heartbeat&依赖 -health&hdf&AGGREGATION -datanode&NameNode&依赖 -list&block&AGGREGATION -datanode&block report&依赖 -datanode&namenode&依赖 -hdf architecture secondary namenode&hdf architecture secondary namenode&依赖 -daemon&DataNode and NameNode&依赖 -daemon&DataNode and NameNode&依赖 -Secondary NameNode&primary NameNode&依赖 -Secondary NameNode&helper node&依赖 -helper 
node&node&GENERALIZATION -NameNode&file&依赖 -NameNode&restart&依赖 -NameNode&long time&依赖 -size&edit log&AGGREGATION -Secondary NameNode&issue&依赖 -Secondary NameNode&NameNode&GENERALIZATION -Secondary NameNode download&NameNode&依赖 -Secondary NameNode download&file&依赖 -It&Fsimage&依赖 -It&edit log&依赖 -its&restart& -updated Fsimage&NameNode&依赖 -NameNode&edit log record&依赖 -secondary NameNode&hdf&依赖 -secondary NameNode®ular checkpoint&依赖 -node&checkpoint&依赖 -node&namespace&依赖 -checkpoint&namespace&AGGREGATION -Checkpoint Node&hadoop first download fsimage&依赖 -Checkpoint Node&hadoop first download fsimage&依赖 -it&them Fsimage and edit&依赖 -it&Fsimage and edit&依赖 -it&new image&依赖 -it&new image&依赖 -directory&same structure&依赖 -directory&Namenode ’s directory&依赖 -It&latest checkpoint&依赖 -It&directory&依赖 -Backup node&node&GENERALIZATION -Backup node&in-memory , up-to-date copy&依赖 -Backup node&in-memory , up-to-date copy&依赖 -Backup node&file system namespace&依赖 -in-memory , up-to-date copy&file system namespace&AGGREGATION -Backup node&file system namespace&依赖 -It&active NameNode state&依赖 -It&namespace state&依赖 -It&up-to-date state&依赖 -It&namespace state&依赖 -up-to-date state&namespace state&AGGREGATION -It&up-to-date state&依赖 -it&namespace&依赖 -NameNode&time&依赖 -NameNode&one Backup node&依赖 -different type&node&AGGREGATION -we&HDFS&依赖 -we&HDFS Architecture tutorial&依赖 -we&Blocks&依赖 -us&block&依赖 -us&hdf&依赖 -hdf&block-sized chunk&依赖 -hdf&file&依赖 -size&block&AGGREGATION -size&128 mb&依赖 -size&default&依赖 -One&block size&依赖 -One&requirement&依赖 -block size&size&GENERALIZATION -hdf&four block&依赖 -size 128 Mb&128 Mb&GENERALIZATION -file&size 612 Mb&AGGREGATION -hdf&size 128 Mb&依赖 -four block&size 128 Mb&AGGREGATION -hdf&four block&依赖 -one block&size 100 Mb&AGGREGATION -hdf&size 128 Mb&依赖 -file&smaller size&AGGREGATION -file&full block size space&依赖 -file&2 Mb space&依赖 -file&size 2 Mb&AGGREGATION -file&disk&依赖 -user&location&依赖 -user&block&依赖 -user&control&依赖 -what&HDFS fault-tolerant&依赖 -datum&other machine&依赖 -datum&multiple place&依赖 -datum&distributed system&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&block&AGGREGATION -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&Hadoop&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&hdfs store replica&依赖 -hdfs store replica&Hadoop&依赖 
-replication factor©&依赖 -number©&AGGREGATION -three copy&different datanode&依赖 -three copy&block&AGGREGATION -block&replica&依赖 -block&block&依赖 -block&other DataNode&依赖 -replica&block&AGGREGATION -data block&block&GENERALIZATION -128 = 384 ) 384 mb&disk space&AGGREGATION -we&128 Mb&依赖 -we&file&依赖 -file&128 Mb&AGGREGATION -128 = 384 ) 384 mb&file&依赖 -replication mechanism&HDFS fault-tolerant&依赖 -collection&40-50 machine datanode&AGGREGATION -Rack&40-50 machine datanode&依赖 -NameNode&rack awareness algorithm&依赖 -NameNode&replica&依赖 -second replica&same rack&依赖 -second replica&other DataNode&依赖 -client&hdf&依赖 -it&metada&依赖 -client&file&依赖 -it&NameNode&依赖 -their&location& -Namenode&block&依赖 -number&block&AGGREGATION -Namenode&number&依赖 -client&DataNode&依赖 -client&DataNode 1&依赖 -IP&other two datanode&AGGREGATION -client&block A&依赖 -Datanode 1&client&依赖 -datanode 1 copy&same rack&依赖 -datanode 1 copy&same rack&依赖 -datanode 1 copy&same block&依赖 -datanode 1 copy&same rack&依赖 -Datanode 1&block a&依赖 -datanode 1 copy&DataNode 2&依赖 -datanode 1 copy&same block&依赖 -datanode 1 copy&DataNode 2&依赖 -datanode 1 copy&same rack&依赖 -datanode 1 copy&same block&依赖 -DataNode 2&same rack&AGGREGATION -datanode 1 copy&DataNode 2&依赖 -datanode 1 copy&same block&依赖 -datanode 1 copy&DataNode 2&依赖 -DataNodes&transfer&依赖 -DataNodes&rack switch&依赖 -datanode 2 copy&same block&依赖 -datanode 2 copy&different rack&依赖 -datanode 2 copy&same block&依赖 -datanode 2 copy&same block&依赖 -datanode 2 copy&different rack&依赖 -datanode 2 copy&same block&依赖 -datanode 2 copy&same block&依赖 -datanode 2 copy&different rack&依赖 -datanode 2 copy&same block&依赖 -datanode 2 copy&different rack&依赖 -datanode 2 copy&different rack&依赖 -datanode 2 copy&different rack&依赖 -datanode 2 copy&different rack&依赖 -datanode 2 copy&same block&依赖 -datanode 2 copy&same block&依赖 -datanode 2 copy&different rack&依赖 -it&Namenode&依赖 -DataNode&client&依赖 -it&confirmation&依赖 -DataNode&block&依赖 -same process&file&依赖 -same process&block&依赖 -client&metada&依赖 -client&NameNode&依赖 -Namenode&location&依赖 -location&datanode&AGGREGATION -client&DataNodes&依赖 -client&data parallelly&依赖 -datum&client&依赖 -datum&DataNode&依赖 -it&block&依赖 -it&form&依赖 -client or application&file&依赖 -client or application&block&依赖 -it&original file&依赖 -form&original file&AGGREGATION -client&Hadoop HDFS&依赖 -client&file&依赖 -file&data block&依赖 -file&A , B , C in ###&依赖 -file&A , B , C&依赖 -file&block&依赖 -block&different datanode&依赖 -Block A and datanode-1 ( dn-1 )&datanode-6 ( dn-6 )&依赖 -Block A and datanode-1 ( dn-1 )&block b&依赖 -Block A and datanode-1 ( dn-1 )&datanode-6 ( dn-6 )&依赖 -Block A and datanode-1 ( dn-1 )&block b&依赖 -2 replica&block&AGGREGATION -case&datanode failure or rack failure&AGGREGATION -we&file&依赖 -size&default&依赖 -we&requirement&依赖 -size&default&依赖 -we&which&依赖 -master node ( namenode ) store&metada&依赖 -master node ( namenode ) store&block location&依赖 -Master Node&DataNodes&依赖 -hdf&block&依赖 -hdf&replica&依赖 -NameNode&Rack Awareness algorithm&依赖 -our&course& -Your&Career& -you&difficulty&依赖 -you&HDFS Architecture tutorial&依赖 diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt deleted file mode 100644 index 880e24ffa9038169edc8a908e17b9b19f9b29884..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt +++ /dev/null @@ -1,198 +0,0 @@ -Hadoop HDFS 
Architecture Explanation and Assumptions -Boost your career with Big Data Get Exclusive Offers on Big Data Course!! -This HDFS tutorial by DataFlair is designed to be an all in one package to answer all your questions about HDFS architecture. - -Hadoop Distributed File System(HDFS) is the world’s most reliable storage system. It is best known for its fault tolerance and high availability. - -In this article about HDFS Architecture Guide, you can read all about Hadoop HDFS. - -First of all, we will discuss what is HDFS next with the Assumptions and Goals of HDFS design. This HDFS architecture tutorial will also cover the detailed architecture of Hadoop HDFS including NameNode, DataNode in HDFS, Secondary node, checkpoint node, Backup Node in HDFS. - -HDFS features like Rack awareness, high Availability, Data Blocks, Replication Management, HDFS data read and write operations are also discussed in this HDFS tutorial. - -What is Hadoop HDFS? -HDFS stores very large files running on a cluster of commodity hardware. It works on the principle of storage of less number of large files rather than the huge number of small files. HDFS stores data reliably even in the case of hardware failure. It provides high throughput by providing the data access in parallel. - -HDFS Assumption and Goals -I. Hardware failure -Hardware failure is no more exception; it has become a regular term. HDFS instance consists of hundreds or thousands of server machines, each of which is storing part of the file system’s data. There exist a huge number of components that are very susceptible to hardware failure. This means that there are some components that are always non-functional. So the core architectural goal of HDFS is quick and automatic fault detection/recovery. - -II. Streaming data access -HDFS applications need streaming access to their datasets. Hadoop HDFS is mainly designed for batch processing rather than interactive use by users. The force is on high throughput of data access rather than low latency of data access. It focuses on how to retrieve data at the fastest possible speed while analyzing logs. - -III. Large datasets -HDFS works with large data sets. In standard practices, a file in HDFS is of size ranging from gigabytes to petabytes. The architecture of HDFS should be design in such a way that it should be best for storing and retrieving huge amounts of data. HDFS should provide high aggregate data bandwidth and should be able to scale up to hundreds of nodes on a single cluster. Also, it should be good enough to deal with tons of millions of files on a single instance. - -IV. Simple coherency model -It works on a theory of write-once-read-many access model for files. Once the file is created, written, and closed, it should not be changed. This resolves the data coherency issues and enables high throughput of data access. A MapReduce-based application or web crawler application perfectly fits in this model. As per apache notes, there is a plan to support appending writes to files in the future. - -V. Moving computation is cheaper than moving data -If an application does the computation near the data it operates on, it is much more efficient than done far of. This fact becomes stronger while dealing with large data set. The main advantage of this is that it increases the overall throughput of the system. It also minimizes network congestion. The assumption is that it is better to move computation closer to data instead of moving data to computation. - -VI. 
Portability across heterogeneous hardware and software platforms -HDFS is designed with the portable property so that it should be portable from one platform to another. This enables the widespread adoption of HDFS. It is the best platform while dealing with a large set of data. - -Introduction to HDFS Architecture -HDFS Architecture - -Hadoop Distributed File System follows the master-slave architecture. Each cluster comprises a single master node and multiple slave nodes. Internally the files get divided into one or more blocks, and each block is stored on different slave machines depending on the replication factor (which you will see later in this article). - -The master node stores and manages the file system namespace, that is information about blocks of files like block locations, permissions, etc. The slave nodes store data blocks of files. - -The Master node is the NameNode and DataNodes are the slave nodes. - -Let’s discuss each of the nodes in the Hadoop HDFS Architecture in detail. - -What is HDFS NameNode? -NameNode is the centerpiece of the Hadoop Distributed File System. It maintains and manages the file system namespace and provides the right access permission to the clients. - -The NameNode stores information about blocks locations, permissions, etc. on the local disk in the form of two files: - -Fsimage: Fsimage stands for File System image. It contains the complete namespace of the Hadoop file system since the NameNode creation. -Edit log: It contains all the recent changes performed to the file system namespace to the most recent Fsimage. -Functions of HDFS NameNode -It executes the file system namespace operations like opening, renaming, and closing files and directories. -NameNode manages and maintains the DataNodes. -It determines the mapping of blocks of a file to DataNodes. -NameNode records each change made to the file system namespace. -It keeps the locations of each block of a file. -NameNode takes care of the replication factor of all the blocks. -NameNode receives heartbeat and block reports from all DataNodes that ensure DataNode is alive. -If the DataNode fails, the NameNode chooses new DataNodes for new replicas. -Before Hadoop2, NameNode was the single point of failure. The High Availability Hadoop cluster architecture introduced in Hadoop 2, allows for two or more NameNodes running in the cluster in a hot standby configuration. - -What is HDFS DataNode? -DataNodes are the slave nodes in Hadoop HDFS. DataNodes are inexpensive commodity hardware. They store blocks of a file. - -Functions of DataNode -DataNode is responsible for serving the client read/write requests. -Based on the instruction from the NameNode, DataNodes performs block creation, replication, and deletion. -DataNodes send a heartbeat to NameNode to report the health of HDFS. -DataNodes also sends block reports to NameNode to report the list of blocks it contains. -What is Secondary NameNode? -HDFS architecture secondary namenode - -Apart from DataNode and NameNode, there is another daemon called the secondary NameNode. Secondary NameNode works as a helper node to primary NameNode but doesn’t replace primary NameNode. - -When the NameNode starts, the NameNode merges the Fsimage and edit logs file to restore the current file system namespace. Since the NameNode runs continuously for a long time without any restart, the size of edit logs becomes too large. This will result in a long restart time for NameNode. - -Secondary NameNode solves this issue. 
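The next paragraphs describe the download-and-merge cycle in detail. As a rough, hypothetical illustration of the merge step itself, the sketch below uses a toy map standing in for the Fsimage and a list of records standing in for the edit log; these are not Hadoop's actual classes.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CheckpointSketch {
    // Toy "Fsimage": a snapshot of path -> metadata.
    static Map<String, String> fsimage = new HashMap<>();
    // Toy "edit log": an ordered list of namespace changes since the last snapshot.
    static List<String[]> editLog = new ArrayList<>();

    static void checkpoint() {
        // Replaying the edit log over the image is the expensive step a cold
        // NameNode restart would otherwise have to do; checkpointing does it offline.
        for (String[] edit : editLog) {
            String op = edit[0], path = edit[1];
            if (op.equals("DELETE")) {
                fsimage.remove(path);
            } else { // CREATE or UPDATE
                fsimage.put(path, edit[2]);
            }
        }
        editLog.clear(); // the merged image replaces image + log
    }

    public static void main(String[] args) {
        editLog.add(new String[] { "CREATE", "/user/a.txt", "3 blocks" });
        editLog.add(new String[] { "DELETE", "/user/a.txt", null });
        checkpoint();
        System.out.println("entries after checkpoint: " + fsimage.size()); // 0
    }
}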
- -Secondary NameNode downloads the Fsimage file and edit logs file from NameNode. - -It periodically applies edit logs to Fsimage and refreshes the edit logs. The updated Fsimage is then sent to the NameNode so that NameNode doesn’t have to re-apply the edit log records during its restart. This keeps the edit log size small and reduces the NameNode restart time. - -If the NameNode fails, the last save Fsimage on the secondary NameNode can be used to recover file system metadata. The secondary NameNode performs regular checkpoints in HDFS. - -What is Checkpoint Node? -The Checkpoint node is a node that periodically creates checkpoints of the namespace. - -Checkpoint Node in Hadoop first downloads Fsimage and edits from the Active Namenode. Then it merges them (Fsimage and edits) locally, and at last, it uploads the new image back to the active NameNode. - -It stores the latest checkpoint in a directory that has the same structure as the Namenode’s directory. This permits the checkpointed image to be always available for reading by the NameNode if necessary. - -What is Backup Node? -A Backup node provides the same checkpointing functionality as the Checkpoint node. - -In Hadoop, Backup node keeps an in-memory, up-to-date copy of the file system namespace. It is always synchronized with the active NameNode state. - -It is not required for the backup node in HDFS architecture to download Fsimage and edits files from the active NameNode to create a checkpoint. It already has an up-to-date state of the namespace state in memory. - -The Backup node checkpoint process is more efficient as it only needs to save the namespace into the local Fsimage file and reset edits. NameNode supports one Backup node at a time. - -This was about the different types of nodes in HDFS Architecture. Further in this HDFS Architecture tutorial, we will learn about the Blocks in HDFS, Replication Management, Rack awareness and read/write operations. - -Let us now study the block in HDFS. - -What are Blocks in HDFS Architecture? -data blocks in hadoop HDFS - -Internally, HDFS split the file into block-sized chunks called a block. The size of the block is 128 Mb by default. One can configure the block size as per the requirement. - -For example, if there is a file of size 612 Mb, then HDFS will create four blocks of size 128 Mb and one block of size 100 Mb. - -The file of a smaller size does not occupy the full block size space in the disk. - -For example, the file of size 2 Mb will occupy only 2 Mb space in the disk. - -The user doesn’t have any control over the location of the blocks. - -Read the HDFS Block article to explore in detail. - -HDFS is highly fault-tolerant. Now, look at what makes HDFS fault-tolerant. - -What is Replication Management? -For a distributed system, the data must be redundant to multiple places so that if one machine fails, the data is accessible from other machines. - -In Hadoop, HDFS stores replicas of a block on multiple DataNodes based on the replication factor. - -The replication factor is the number of copies to be created for blocks of a file in HDFS architecture. - -If the replication factor is 3, then three copies of a block get stored on different DataNodes. So if one DataNode containing the data block fails, then the block is accessible from the other DataNode containing a replica of the block. - -If we are storing a file of 128 Mb and the replication factor is 3, then (3*128=384) 384 Mb of disk space is occupied for a file as three copies of a block get stored. 
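As a minimal, dependency-free sketch of that arithmetic: the 128 Mb and 524 Mb figures are the examples used in this document, and the key point is that the last block is not padded, so raw usage is simply file size times replication factor.

public class ReplicatedStorageEstimate {
    // Raw bytes consumed across the cluster for one file: HDFS does not pad the
    // last block, so usage is file size times replication factor.
    static long rawUsageBytes(long fileSizeBytes, int replicationFactor) {
        return fileSizeBytes * replicationFactor;
    }

    static long blockCount(long fileSizeBytes, long blockSizeBytes) {
        return (fileSizeBytes + blockSizeBytes - 1) / blockSizeBytes; // ceiling division
    }

    public static void main(String[] args) {
        long mb = 1024L * 1024L;
        long blockSize = 128 * mb;
        // The example above: a 128 MB file with replication 3 occupies 3 * 128 = 384 MB.
        System.out.println(rawUsageBytes(128 * mb, 3) / mb + " MB on disk");
        // A 524 MB file still splits into 5 blocks, the last one holding only 12 MB.
        System.out.println(blockCount(524 * mb, blockSize) + " blocks");
    }
}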
- -This replication mechanism makes HDFS fault-tolerant. - -Read the Fault tolerance article to learn in detail. - -What is Rack Awareness in HDFS Architecture? -Let us now talk about how HDFS store replicas on the DataNodes? What is a rack? What is rack awareness? - -Rack is the collection of around 40-50 machines (DataNodes) connected using the same network switch. If the network goes down, the whole rack will be unavailable. - -Rack Awareness is the concept of choosing the closest node based on the rack information. - -To ensure that all the replicas of a block are not stored on the same rack or a single rack, NameNode follows a rack awareness algorithm to store replicas and provide latency and fault tolerance. - -Suppose if the replication factor is 3, then according to the rack awareness algorithm: - -The first replica will get stored on the local rack. -The second replica will get stored on the other DataNode in the same rack. -The third replica will get stored on a different rack. -HDFS Read and Write Operation -1. Write Operation - -When a client wants to write a file to HDFS, it communicates to the NameNode for metadata. The Namenode responds with a number of blocks, their location, replicas, and other details. Based on information from NameNode, the client directly interacts with the DataNode. - -The client first sends block A to DataNode 1 along with the IP of the other two DataNodes where replicas will be stored. When Datanode 1 receives block A from the client, DataNode 1 copies the same block to DataNode 2 of the same rack. As both the DataNodes are in the same rack, so block transfer via rack switch. Now DataNode 2 copies the same block to DataNode 4 on a different rack. As both the DataNoNes are in different racks, so block transfer via an out-of-rack switch. - -When DataNode receives the blocks from the client, it sends write confirmation to Namenode. - -The same process is repeated for each block of the file. - -2. Read Operation - -To read from HDFS, the client first communicates with the NameNode for metadata. The Namenode responds with the locations of DataNodes containing blocks. After receiving the DataNodes locations, the client then directly interacts with the DataNodes. - -The client starts reading data parallelly from the DataNodes based on the information received from the NameNode. The data will flow directly from the DataNode to the client. - -When a client or application receives all the blocks of the file, it combines these blocks into the form of an original file. - -Go through the HDFS read and write operation article to study how the client can read and write files in Hadoop HDFS. - -Overview Of HDFS Architecture -In Hadoop HDFS, NameNode is the master node and DataNodes are the slave nodes. The file in HDFS is stored as data blocks. - -The file is divided into blocks (A, B, C in the below GIF). These blocks get stored on different DataNodes based on the Rack Awareness Algorithm. Block A on DataNode-1(DN-1), block B on DataNode-6(DN-6), and block C on DataNode-7(DN-7). - -To provide Fault Tolerance, replicas of blocks are created based on the replication factor. - -In the below GIF, 2 replicas of each block is created (using default replication factor 3). Replicas were placed on different DataNodes, thus ensuring data availability even in the case of DataNode failure or rack failure. - -hadoop hdfs architecture - -So, This was all on HDFS Architecture Tutorial. Follow the following links to master HDFS architecture. 
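To make the read/write flow above concrete, here is a minimal client-side sketch using the standard org.apache.hadoop.fs.FileSystem API. The namenode address and file path are placeholders, and error handling is reduced to the bare minimum; block placement and DataNode streaming happen inside the client library exactly as the text describes.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsReadWriteSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://namenode.example.com:8020"); // placeholder address
        Path file = new Path("/user/demo/hello.txt");

        try (FileSystem fs = FileSystem.get(conf)) {
            // Write: the client asks the NameNode where to place each block,
            // then streams the data directly to the chosen DataNodes.
            try (FSDataOutputStream out = fs.create(file, true)) {
                out.write("hello hdfs".getBytes(StandardCharsets.UTF_8));
            }
            // Read: the NameNode returns block locations; the bytes themselves
            // flow straight from the DataNodes back to the client.
            try (FSDataInputStream in = fs.open(file)) {
                byte[] buf = new byte[32];
                int n = in.read(buf);
                System.out.println(new String(buf, 0, n, StandardCharsets.UTF_8));
            }
        }
    }
}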
- -Summary -After reading the HDFS architecture tutorial, we can conclude that the HDFS divides the files into blocks. The size of the block is 128 Mb by default, which we can configure as per the requirements. - -The master node (NameNode) stores and manages the metadata about block locations, blocks of a file, etc.The DataNode stores the actual data blocks. The Master Node manages the DataNodes. - -HDFS creates replicas of blocks and stores them on different DataNodes in order to provide fault tolerance. Also, NameNode uses the Rack Awareness algorithm to improve cluster performance. - -Loving Hadoop? Join our course and Boost Your Career with BIG DATA - -If you face any difficulty in this HDFS Architecture tutorial, please comment and ask. - -Keep Learning!! \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt.xml.xls deleted file mode 100644 index 77a7b1d4ed65c3c0dadc8805203c56f50298e501..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS.xls new file mode 100644 index 0000000000000000000000000000000000000000..2bd2c5de3a19f8cb9ff34732cecbf70208c93de6 Binary files /dev/null and b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop HDFS.xls differ diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview-relation.txt deleted file mode 100644 index f3ce0b17765db36bdf66e9250cf0cd8366610eaa..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview-relation.txt +++ /dev/null @@ -1,359 +0,0 @@ -post&monitor hadoop health and performance&依赖 -post&4-part series&依赖 -part 1&4-part series&AGGREGATION -Part 4&Hadoop deployment&依赖 -Part 2&part 3 detail&依赖 -Hadoop&metrics& -Apache Hadoop&large data set&依赖 -Apache Hadoop&Hadoop&GENERALIZATION -distributed computation and storage&large data set&AGGREGATION -Apache Hadoop&computer cluster&依赖 -Apache Hadoop&distributed computation and storage&依赖 -Hadoop&’s mapreduce programming model&实现 -Google&model& -rich ecosystem&related technology&AGGREGATION -Hadoop&Facebook&依赖 -Hadoop&widespread adoption&依赖 -Hadoop&many company&依赖 -Hadoop architecture overview Hadoop&three core component&依赖 -you&high availability&依赖 -hdf&term “ master ”&依赖 -hdf&primary node&依赖 -we&more inclusive term “ leader&依赖 -we&original term&依赖 -we&case&依赖 -file system&Hadoop cluster&AGGREGATION -Hadoop cluster&cluster&GENERALIZATION -file system ( hdf )&Hadoop cluster&依赖 -Several attribute&other distributed file system&依赖 -Several attribute&hdf&依赖 -default block size&128 MB&AGGREGATION -total&three copy&AGGREGATION -block&two replica&依赖 -datum&default replication factor&依赖 -default replication factor&three&AGGREGATION -datum&three&依赖 -hdf&it&依赖 -hdf&cluster&依赖 -Vanilla HDFS High-availability HDFS hdf&leader/follower architecture&依赖 -cluster&single NameNode&依赖 -event&failure )&AGGREGATION -arbitrary number&DataNodes&AGGREGATION -NameNode&file&依赖 -NameNode&once broker&依赖 -NameNode&file system namespace&依赖 -NameNode&client&依赖 -NameNode&addition&依赖 
-NameNode&leader and brokers access&依赖 -its&state& -NameNode&state&依赖 -It&failure&依赖 -It&Hadoop cluster&依赖 -It&single point&依赖 -single point&failure&AGGREGATION -production cluster&case&依赖 -case&a single disk failure )&AGGREGATION -production cluster&state&依赖 -production cluster&a single disk failure )&依赖 -production cluster&state&依赖 -case&total machine failure )&AGGREGATION -standby NameNode&NameNode&GENERALIZATION -Hadoop&standby NameNode&依赖 -Earlier version&Hadoop&AGGREGATION -Earlier version&SecondaryNameNode concept&依赖 -introduction&SecondaryNameNode concept&AGGREGATION -Earlier version&introduction&依赖 -today&SecondaryNameNode&依赖 -Earlier version&alternative&依赖 -function&SecondaryNameNode&AGGREGATION -understand&explanation&依赖 -understand&mechanism&依赖 -NameNode&state&依赖 -explanation&mechanism&AGGREGATION -NameNode&mechanism&依赖 -fsimage&fsimage&依赖 -fsimage&NameNode stores file system metada&依赖 -fsimage&two different file&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&’s metada&依赖 -fsimage store&’s metada&依赖 -complete snapshot&’s metada&AGGREGATION -fsimage store&’s metada&依赖 -fsimage store&’s metada&依赖 -fsimage store&’s metada&依赖 -system&metadata& -fsimage store&complete snapshot&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&’s metada&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&’s metada&依赖 -fsimage store&’s metada&依赖 -fsimage store&’s metada&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&’s metada&依赖 -fsimage store&complete snapshot&依赖 -fsimage store&’s metada&依赖 -fsimage store&’s metada&依赖 -Incremental change&durability&依赖 -Incremental change&edit log&依赖 -NameNode&place&依赖 -NameNode&state&依赖 -separation&concern&AGGREGATION -NameNode&concern&依赖 -NameNode&state&依赖 -copy&fsimage&AGGREGATION -its©& -SecondaryNameNode&fsimage&依赖 -time change&edit log&依赖 -SecondaryNameNode©&依赖 -cluster administrator&fsimage&依赖 -NameNode&SecondaryNameNode&依赖 -cluster administrator&fsimage&依赖 -presence&SecondaryNameNode&AGGREGATION -NameNode&presence&依赖 -updated copy&fsimage&AGGREGATION -top&fsimage&AGGREGATION -cluster administrator&updated copy&依赖 -NameNode&edit log&依赖 -cluster administrator&updated copy&依赖 -event&NameNode failure&AGGREGATION -its&name& -HA NameNode service Early version&several concept&依赖 -HA NameNode service Early version&Hadoop&AGGREGATION -HA NameNode service Early version&secondarynamenode&依赖 -HA NameNode service Early version&other )&依赖 -mechanism&Hadoop 2.0&依赖 -event&primary NameNode failure&AGGREGATION -high availability&shared storage&依赖 -high availability&shared storage&依赖 -high availability&primary and standby&依赖 -high availability&edit log )&依赖 -high availability&primary and standby&依赖 -high availability&edit log )&依赖 -consistency&active and standby state&AGGREGATION -group&journalnodes ( jns )&AGGREGATION -majority&JournalNodes&AGGREGATION -Active node&namespace&依赖 -it&record&依赖 -it&JournalNodes&依赖 -record&change&AGGREGATION -it&change&依赖 -it&majority&依赖 -Active node&node&GENERALIZATION -StandbyNode&edit log&依赖 -its&namespace& -StandbyNode&jn&依赖 -StandbyNode&change&依赖 -QJM interaction diagram JournalNode daemon&low overhead&依赖 -daemon&Hadoop node&依赖 -daemon&same machine&依赖 -daemon&ResourceManager&依赖 -edit log change&JNs&依赖 -quorum&JNs&AGGREGATION -you&odd number&依赖 -odd number&least three daemon&AGGREGATION -you&running&依赖 -edit log 
change&quorum&依赖 -number&jn )&AGGREGATION -JournalNodes&failure&依赖 -JournalNodes&2 node&依赖 -N&jn )&依赖 -Alternative file systems hdf&Hadoop&依赖 -number&alternative file system&AGGREGATION -’s file system abstraction&local file system&依赖 -’s file system abstraction&alternative file system&依赖 -Azure&system& -’s file system abstraction&number&依赖 -Hadoop&abstraction& -file system&access URI&依赖 -production hdf&production hdf&依赖 -Most&limitation&依赖 -core&set&依赖 -core&MapReduce job&AGGREGATION -MapReduce job&job&GENERALIZATION -core&input datum&依赖 -collection&< key&AGGREGATION -top-level unit&work&AGGREGATION -job&one or more map&依赖 -canonical example&MapReduce job&AGGREGATION -canonical example&word frequency&依赖 -body&text&AGGREGATION -image&example&依赖 -MapReduce&own resource allocation and job scheduling&依赖 -care&own resource allocation and job scheduling&AGGREGATION -earlier version&hadoop ( pre-2 .0 )&AGGREGATION -its&allocation& -MapReduce&own resource allocation and job scheduling&依赖 -MapReduce&care&依赖 -MapReduce&care&依赖 -MapReduce&own resource allocation and job scheduling&依赖 -MapReduce&care&依赖 -MapReduce&own resource allocation and job scheduling&依赖 -MapReduce&care&依赖 -Newer version&computation&依赖 -allocation&computational resource&AGGREGATION -Newer version&scheduling&依赖 -Newer version&hadoop ( 2.0 + )&AGGREGATION -Hadoop&box&依赖 -number&framework&AGGREGATION -Hadoop&MapReduce&依赖 -article series&compute framework&依赖 -article series&MapReduce&依赖 -Hadoop architecture&three core component&依赖 -Hadoop architecture&three core component&依赖 -YARN&uncommon way&依赖 -YARN&common term&依赖 -most people&container ”&依赖 -a resource container ( rc )&collection&依赖 -collection&physical resource&AGGREGATION -a resource container ( rc )&physical resource&依赖 -it&new meaning&依赖 -“ Application ”&YARN&依赖 -application&set&依赖 -application&task&依赖 -set&task&AGGREGATION -MapReduce&concept& -Application&’s job concept&依赖 -Application&’s job concept&依赖 -ResourceManager The ResourceManager&YARN&依赖 -inventory&available resource&AGGREGATION -most important&which&AGGREGATION -scheduler scheduler component&YARN ResourceManager&AGGREGATION -scheduler scheduler component&resource&依赖 -it&application status or progress&依赖 -it&monitoring&依赖 -YARN&several scheduler policy&依赖 -YARN&Hadoop 2.7.2&依赖 -Scheduler&resource&依赖 -bundle&physical resource )&AGGREGATION -default scheduler&Hadoop distribution&依赖 -its&instance& -application&own dedicated ApplicationMaster instance&依赖 -application&own dedicated ApplicationMaster instance&依赖 -instance&one&依赖 -one&node&AGGREGATION -instance&node&依赖 -instance&cluster&依赖 -its&container& -instance&own , separate container&依赖 -’s applicationmaster&ResourceManager&依赖 -’s applicationmaster&heartbeat message&依赖 -application&ApplicationMaster& -assignment&Container Resource lease&AGGREGATION -Additional resource&assignment&依赖 -Additional resource&Container Resource lease&依赖 -Additional resource&ResourceManager&依赖 -ApplicationMaster&execution&依赖 -execution&application&AGGREGATION -ApplicationMaster&application&依赖 -ApplicationMaster&full lifespan&依赖 -its&lifespan& -their&lifecycles& -nodemanager&earlier version&依赖 -earlier version&Hadoop&AGGREGATION -nodemanager&tasktracker&依赖 -nodemanager&Hadoop&依赖 -nodemanager&dynamically create , arbitrarily-sized resource containers ( rc )&依赖 -nodemanager&number&依赖 -tasktracker&fixed number&依赖 -fixed number&map&AGGREGATION -tasktracker&map&依赖 -number&dynamically create , arbitrarily-sized resource containers ( rc )&AGGREGATION -application&flow&依赖 -application&flow&依赖 
-application&flow&依赖 -application&flow&依赖 -Client program&MapReduce application&依赖 -MapReduce application&application&GENERALIZATION -Client program&ResourceManager&依赖 -ResourceManager&ApplicationMaster&依赖 -ResourceManager&container&依赖 -applicationmaster boot&original calling client&依赖 -applicationmaster boot&ResourceManager&依赖 -applicationmaster boot&ResourceManager&依赖 -applicationmaster boot&ResourceManager&依赖 -applicationmaster boot&original calling client&依赖 -applicationmaster boot&original calling client&依赖 -client application&application&GENERALIZATION -ApplicationMaster&client application&依赖 -ApplicationMaster&resource and ( resource container&依赖 -NodeManager&container&依赖 -ApplicationMaster&container launch specification&依赖 -ApplicationMaster&NodeManager&依赖 -NodeManager&application&依赖 -client poll&execution&依赖 -client poll&application status and progress&依赖 -applicationmaster deregister&completion&依赖 -applicationmaster deregister&ResourceManager&依赖 -its&containers& -applicationmaster deregister&completion&依赖 -applicationmaster deregister&completion&依赖 -applicationmaster deregister&ResourceManager&依赖 -applicationmaster deregister&ResourceManager&依赖 -coordination and synchronization&distributed system&AGGREGATION -high-availability&former single point&AGGREGATION -former single point&failure&AGGREGATION -NameNode&failure —&依赖 -previous version&Hadoop&AGGREGATION -NameNode&single point&依赖 -single point&failure —&AGGREGATION -Hadoop 2.0&high-availability NameNode service&依赖 -Hadoop 2.0&many improvement&依赖 -Hadoop 2.0&them&依赖 -ZooKeeper&qjm or nf&依赖 -it&automatic failover&依赖 -ZooKeeper&conjunction&依赖 -Automatic NameNode failover&two component&依赖 -NameNode&equivalent&依赖 -NameNode and Standby NameNodes&ZooKeeper&依赖 -NameNode and Standby NameNodes&persistent session&依赖 -NameNode&file or directory&依赖 -NameNode&a regular file system )&依赖 -NameNode&special , ephemeral “ lock ” znode&依赖 -its&session& -NameNode&ZooKeeper ensemble&依赖 -NameNode&contact&依赖 -equivalent&file or directory&AGGREGATION -health&node&AGGREGATION -a failover (&health&依赖 -a failover (&node&依赖 -other node&lock (&依赖 -new namenode transition&active NameNode&依赖 -new namenode transition&active NameNode&依赖 -new namenode transition&active NameNode&依赖 -its&ResourceManager& -Hadoop 2.4&’s resilience&依赖 -release&ResourceManager high-availability feature&AGGREGATION -Hadoop 2.4&ResourceManager high-availability feature&依赖 -YARN&resilience& -Hadoop 2.4&release&依赖 -event&primary ’s failure&AGGREGATION -new feature&ZooKeeper&依赖 -YARN&similar , ZooKeeper-managed lock&依赖 -YARN&hdf&依赖 -ActiveStandbyElector service&ResourceManager process&依赖 -its&service& -part&ResourceManager process&AGGREGATION -YARN&mechanism& -ActiveStandbyElector service&ephemeral znode&依赖 -ActiveStandbyElector service&control&依赖 -ActiveStandbyElector service&ZKFailoverController&依赖 -ActiveStandbyElector service&control&依赖 -ActiveStandbyElector service&ephemeral znode&依赖 -control&ephemeral znode and ActiveStandbyElectorLock&AGGREGATION -ActiveStandbyElector service&ZKFailoverController&依赖 -RM&lock&依赖 -active RM&session&依赖 -RM&active state&依赖 -RM&ActiveStandbyElectorLock&依赖 -we&found&依赖 -we&found&依赖 -we&core component&依赖 -we&core component&依赖 -examination&’s key performance metric and health indicator&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview-simEnts.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview-simEnts.txt deleted file mode 100644 index 
26b458fe0aa746913c9988ea22390885585428ff..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview-simEnts.txt +++ /dev/null @@ -1,76 +0,0 @@ -Namenode,Namenode -NameNode,class INode -NameNode,class INodeReference -NameNode,class INodesInPath -NameNode,class INodeDirectory -NameNode,class INodeWithAdditionalFields -NameNode,class XAttrFeature -NameNode,class FileUnderConstructionFeature -NameNode,class DirectoryWithSnapshotFeature -NameNode,class DirectorySnapshottableFeature -NameNode,class AclFeature -NameNode,class DirectoryWithQuotaFeature -NameNode,class EditLogFileOutputStream -NameNode,class EditLogBackupOutputStream -NameNode,class QuorumOutputStream -NameNode,class JournalSetOutputStream -NameNode,class EditLogFileInputStream -NameNode,class EditLogBackupInputStream -NameNode,class EditLogByteInputStream -NameNode,class RedundantEditLogInputStream -INode,INode -INodeReference,INodeReference -INodesInPath,INodesInPath -INodeDirectory,INodeDirectory -INodeWithAdditionalFields,INodeWithAdditionalFields -Feature,Feature -XAttrFeature,XAttrFeature -FileUnderConstructionFeature,FileUnderConstructionFeature -DirectoryWithSnapshotFeature,DirectoryWithSnapshotFeature -DirectorySnapshottableFeature,DirectorySnapshottableFeature -AclFeature,AclFeature -DirectoryWithQuotaFeature,DirectoryWithQuotaFeature -EditLogFileOutputStream,EditLogFileOutputStream -EditLogBackupOutputStream,EditLogBackupOutputStream -EditLogBackupOutputStream,Edit log -QuorumOutputStream,QuorumOutputStream -QuorumOutputStream,ByteRangeInputStream -JournalSetOutputStream,JournalSetOutputStream -EditLogFileInputStream,EditLogFileInputStream -EditLogBackupInputStream,EditLogBackupInputStream -EditLogBackupInputStream,Edit log -EditLogByteInputStream,EditLogByteInputStream -EditLogByteInputStream,Edit log -RedundantEditLogInputStream,RedundantEditLogInputStream -Datanode,Datanode -DataNode,class Storage -DataNode,class DataStorage -DataNode,class StorageInfo -DataNode,class BlockPoolSlice -DataNode,class FsVolumeImpl -DataNode,class BlockManager -Tools,class DFSAdmin -Tools,class AdminHelper -Tools,class ECAdmin -Tools,class CryptoAdmin -Balancer,class DFSAdmin -Balancer,class AdminHelper -Balancer,class ECAdmin -Balancer,class CryptoAdmin -Protocol,class ClientProtocol -Protocol,class DataNodeProtocol -Protocol,class InterDataNodeProtocol -Security,class LightWeightHashSet -Security,class LightWeightLinkedSet -Security,class LinkedSetIterator -Security,class ImageVisitor -Security,class LsImageVisitor -Security,class XmlImageVisitor -Security,class FileDistributionVisitor -Security,class IndentedImageVisitor -Client,class DFSClient -Client,class DFSOutputStream -Client,class DfsClientConf -Client,class BlockReaderFactory -Client,class StrippedDataStreamer -Common,class Command diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt deleted file mode 100644 index 117d4bb994227d32f316e7e1518d473bba5a51d7..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt +++ /dev/null @@ -1,76 +0,0 @@ -Namenode , Namenode -NameNode , class I Node -NameNode , class I Node Reference -NameNode , class I Nodes In Path -NameNode , class I Node Directory -NameNode , class I Node With Additional Fields -NameNode , class X Attr Feature -NameNode , class File Under Construction 
Feature -NameNode , class Directory With Snapshot Feature -NameNode , class Directory Snapshottable Feature -NameNode , class Acl Feature -NameNode , class Directory With Quota Feature -NameNode , class Edit Log File Output Stream -NameNode , class Edit Log Backup Output Stream -NameNode , class Quorum Output Stream -NameNode , class Journal Set Output Stream -NameNode , class Edit Log File Input Stream -NameNode , class Edit Log Backup Input Stream -NameNode , class Edit Log Byte Input Stream -NameNode , class Redundant Edit Log Input Stream -INode , INode -INode Reference , INode Reference -INodes In Path , INodes In Path -INode Directory , INode Directory -INode With Additional Fields , INode With Additional Fields -Feature , Feature -XAttr Feature , XAttr Feature -File Under Construction Feature , File Under Construction Feature -Directory With Snapshot Feature , Directory With Snapshot Feature -Directory Snapshottable Feature , Directory Snapshottable Feature -Acl Feature , Acl Feature -Directory With Quota Feature , Directory With Quota Feature -Edit Log File Output Stream , Edit Log File Output Stream -Edit Log Backup Output Stream , Edit Log Backup Output Stream -Edit Log Backup Output Stream , Edit log -Quorum Output Stream , Quorum Output Stream -Quorum Output Stream , Byte Range Input Stream -Journal Set Output Stream , Journal Set Output Stream -Edit Log File Input Stream , Edit Log File Input Stream -Edit Log Backup Input Stream , Edit Log Backup Input Stream -Edit Log Backup Input Stream , Edit log -Edit Log Byte Input Stream , Edit Log Byte Input Stream -Edit Log Byte Input Stream , Edit log -Redundant Edit Log Input Stream , Redundant Edit Log Input Stream -Datanode , Datanode -DataNode , class Storage -DataNode , class Data Storage -DataNode , class Storage Info -DataNode , class Block Pool Slice -DataNode , class Fs Volume Impl -DataNode , class Block Manager -Tools , class D F S Admin -Tools , class Admin Helper -Tools , class E C Admin -Tools , class Crypto Admin -Balancer , class D F S Admin -Balancer , class Admin Helper -Balancer , class E C Admin -Balancer , class Crypto Admin -Protocol , class Client Protocol -Protocol , class Data Node Protocol -Protocol , class Inter Data Node Protocol -Security , class Light Weight Hash Set -Security , class Light Weight Linked Set -Security , class Linked Set Iterator -Security , class Image Visitor -Security , class Ls Image Visitor -Security , class Xml Image Visitor -Security , class File Distribution Visitor -Security , class Indented Image Visitor -Client , class D F S Client -Client , class D F S Output Stream -Client , class Dfs Client Conf -Client , class Block Reader Factory -Client , class Stripped Data Streamer -Common , class Command \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt deleted file mode 100644 index c37613f4b7c9efa2e2ebbee297d61d3a3c1f6596..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt +++ /dev/null @@ -1,149 +0,0 @@ -This post is part 1 of a 4-part series on monitoring Hadoop health and performance. Part 2 dives into the key metrics to monitor, Part 3 details how to monitor Hadoop performance natively, and Part 4 explains how to monitor a Hadoop deployment with Datadog. 
- -In this post, we’ll explore each of the technologies that make up a typical Hadoop deployment, and see how they all fit together. If you’re already familiar with HDFS, MapReduce, and YARN, feel free to continue on to Part 2 to dive right into Hadoop’s key performance metrics. - -What is Hadoop? -Apache Hadoop is a framework for distributed computation and storage of very large data sets on computer clusters. Hadoop began as a project to implement Google’s MapReduce programming model, and has become synonymous with a rich ecosystem of related technologies, not limited to: Apache Pig, Apache Hive, Apache Spark, Apache HBase, and others. - -Hadoop has seen widespread adoption by many companies including Facebook, Yahoo!, Adobe, Cisco, eBay, Netflix, and Datadog. - -Hadoop architecture overview -Hadoop has three core components, plus ZooKeeper if you want to enable high availability: - -Hadoop Distributed File System (HDFS) -MapReduce -Yet Another Resource Negotiator (YARN) -ZooKeeper -Note that HDFS uses the term “master” to describe the primary node in a cluster. Where possible, we will use the more inclusive term “leader.” In cases where using an alternative term would introduce ambiguity, such as the YARN-specific class name ApplicationMaster, we preserve the original term. - -HDFS architecture -The Hadoop Distributed File System (HDFS) is the underlying file system of a Hadoop cluster. It provides scalable, fault-tolerant, rack-aware data storage designed to be deployed on commodity hardware. Several attributes set HDFS apart from other distributed file systems. Among them, some of the key differentiators are that HDFS is: - -designed with hardware failure in mind -built for large datasets, with a default block size of 128 MB -optimized for sequential operations -rack-aware -cross-platform and supports heterogeneous clusters -Data in a Hadoop cluster is broken down into smaller units (called blocks) and distributed throughout the cluster. Each block is duplicated twice (for a total of three copies), with the two replicas stored on two nodes in a rack somewhere else in the cluster. Since the data has a default replication factor of three, it is highly available and fault-tolerant. If a copy is lost (because of machine failure, for example), HDFS will automatically re-replicate it elsewhere in the cluster, ensuring that the threefold replication factor is maintained. - -HDFS architecture can vary, depending on the Hadoop version and features needed: - -Vanilla HDFS -High-availability HDFS -HDFS is based on a leader/follower architecture. Each cluster is typically composed of a single NameNode, an optional SecondaryNameNode (for data recovery in the event of failure), and an arbitrary number of DataNodes. - -Hadoop architecture - Vanilla Hadoop deployment diagramA vanilla Hadoop deployment -In addition to managing the file system namespace and associated metadata (file-to-block maps), the NameNode acts as the leader and brokers access to files by clients (though once brokered, clients communicate directly with DataNodes). The NameNode operates entirely in memory, persisting its state to disk. It represents a single point of failure for a Hadoop cluster that is not running in high-availability mode. To mitigate against this, production clusters typically persist state to two local disks (in case of a single disk failure) and also to an NFS-mounted volume (in case of total machine failure). In high-availability mode, Hadoop maintains a standby NameNode to guard against failures. 
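As a concrete illustration of the interaction just described, the sketch below uses the standard org.apache.hadoop.fs.FileSystem client API to create a file and then ask where its blocks landed. This is a minimal, hypothetical example (the path, contents, and replication factor are placeholders, and the code is not part of this repository): metadata operations are brokered by the NameNode, while the written bytes stream to DataNodes.

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsClientSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();         // picks up core-site.xml / hdfs-site.xml
        FileSystem fs = FileSystem.get(conf);             // metadata requests go to the NameNode
        Path file = new Path("/tmp/hello.txt");           // placeholder path
        try (FSDataOutputStream out = fs.create(file, (short) 3)) { // replication factor of three
            out.write("hello hdfs".getBytes(StandardCharsets.UTF_8)); // bytes flow to DataNodes
        }
        FileStatus status = fs.getFileStatus(file);
        for (BlockLocation block : fs.getFileBlockLocations(status, 0, status.getLen())) {
            System.out.println(Arrays.toString(block.getHosts())); // DataNodes holding the replicas
        }
    }
}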
Earlier versions of Hadoop offered an alternative with the introduction of the SecondaryNameNode concept, and many clusters today still operate with a SecondaryNameNode. - -Understanding the function of the SecondaryNameNode requires an explanation of the mechanism by which the NameNode stores its state. - -fsimage and the edit log -The NameNode stores file system metadata in two different files: the fsimage and the edit log. The fsimage stores a complete snapshot of the file system’s metadata at a specific moment in time. Incremental changes (like renaming or appending a few bytes to a file) are then stored in the edit log for durability, rather than creating a new fsimage snapshot each time the namespace is modified. With this separation of concerns in place, the NameNode can restore its state by loading the fsimage and performing all the transforms from the edit log, restoring the file system to its most recent state. - -Hadoop architecture - Secondary NameNode architecture diagram -Through RPC calls, the SecondaryNameNode is able to independently update its copy of the fsimage each time changes are made to the edit log. Thus, if the NameNode goes down in the presence of a SecondaryNameNode, the NameNode doesn’t need to replay the edit log on top of the fsimage; cluster administrators can retrieve an updated copy of the fsimage from the SecondaryNameNode. - -SecondaryNameNodes provide a means for much faster recovery in the event of NameNode failure. Despite its name, though, it is not a drop-in replacement for the NameNode and does not provide a means for automated failover. - -HA NameNode service -Early versions of Hadoop introduced several concepts (like SecondaryNameNodes, among others) to make the NameNode more resilient. With Hadoop 2.0 and Standby NameNodes, a mechanism for true high availability was realized. - -Standby NameNodes, which are incompatible with SecondaryNameNodes, provide automatic failover in the event of primary NameNode failure. Achieving high availability with Standby NameNodes requires shared storage between the primary and standbys (for the edit log). - -Though there are two options for the necessary shared storage—NFS and Quorum Journal Manager (QJM)—only QJM is considered production-ready. - -NameNode and QJM -Using the Quorum Journal Manager (QJM) is the preferred method for achieving high availability for HDFS. - -Using QJM to maintain consistency of Active and Standby state requires that both nodes be able to communicate with a group of JournalNodes (JNs). When the Active node modifies the namespace, it logs a record of the change to a majority of JournalNodes. The StandbyNode watches the JNs for changes to the edit log and applies them to its own namespace. - -Hadoop architecture - QJM interaction diagram -JournalNode daemons have relatively low overhead, so provisioning additional machines for them is unnecessary—the daemons can be run on the same machines as existing Hadoop nodes. Typically, a daemon is run on the ResourceManager as well as on each of the two NameNodes. Because edit log changes require a quorum of JNs, you must maintain an odd number of at least three daemons running at any one time. JournalNodes can tolerate failures of at most (N - 1) / 2 nodes (where N is the number of JNs). - -Alternative file systems -HDFS is the canonical file system for Hadoop, but Hadoop’s file system abstraction supports a number of alternative file systems, including the local file system, FTP, AWS S3, Azure’s file system, and OpenStack’s Swift.
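A minimal sketch of how this abstraction looks from client code (placeholders only, not from this repository): the same org.apache.hadoop.fs.FileSystem API is used for every backend, and the scheme in the URI selects the implementation, as the next sentence explains. Other schemes such as s3 work the same way when the corresponding connector is on the classpath.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FsAbstractionSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Local file system, selected by the file: scheme
        FileSystem local = FileSystem.get(URI.create("file:///"), conf);
        System.out.println(local.exists(new Path("/tmp")));
        // HDFS, selected by the hdfs: scheme (placeholder NameNode address)
        FileSystem hdfs = FileSystem.get(URI.create("hdfs://namenode:8020/"), conf);
        System.out.println(hdfs.getUri());
    }
}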
The file system used is determined by the access URI, e.g., file: for the local file system, s3: for data stored on Amazon S3, etc. Most of these have limitations, though, and in production HDFS is almost always the file system used for the cluster. - -MapReduce overview -MapReduce is a framework tailor-made for processing large datasets in a distributed fashion across multiple machines. The core of a MapReduce job can be, err, reduced to three operations: map an input data set into a collection of <key, value> pairs, shuffle the resulting data (transfer data to the reducers), then reduce over all pairs with the same key. - -The top-level unit of work in MapReduce is a job. Each job is composed of one or more map or reduce tasks. - -The canonical example of a MapReduce job is counting word frequencies in a body of text. The image below illustrates such an example: - -Hadoop architecture - MapReduce word frequency flow diagram -Key differences between versions -In earlier versions of Hadoop (pre-2.0), MapReduce took care of its own resource allocation and job scheduling as well as the actual computation. - -Newer versions of Hadoop (2.0+) decouple the scheduling from the computation with YARN, which handles the allocation of computational resources for MapReduce jobs. This allows other processing frameworks (see below) to share the cluster without resource contention. - -Other frameworks -Though Hadoop comes with MapReduce out of the box, a number of computing frameworks have been developed for or adapted to the Hadoop ecosystem. Among the more popular are Apache Spark and Apache Tez. This article series will focus on MapReduce as the compute framework. - -Untangling YARN -YARN (Yet Another Resource Negotiator) is the framework responsible for assigning computational resources for application execution. - -Hadoop architecture - YARN architecture diagram -YARN consists of three core components: - -ResourceManager (one per cluster) -ApplicationMaster (one per application) -NodeManagers (one per node) -Caution, overloaded terms ahead -YARN uses some very common terms in uncommon ways. For example, when most people hear “container”, they think Docker. In the Hadoop ecosystem, it takes on a new meaning: a Resource Container (RC) represents a collection of physical resources. It is an abstraction used to bundle resources into distinct, allocatable units. - -“Application” is another overloaded term—in YARN, an application represents a set of tasks that are to be executed together. Application in YARN is synonymous with MapReduce’s job concept. - -ResourceManager -The ResourceManager is the rack-aware leader node in YARN. It is responsible for taking inventory of available resources and runs several critical services, the most important of which is the Scheduler. - -Scheduler -The Scheduler component of the YARN ResourceManager allocates resources to running applications. It is a pure scheduler in that it does not monitor or track application status or progress. As it performs no monitoring, it cannot guarantee that tasks will restart should they fail. - -As of Hadoop 2.7.2, YARN supports several scheduler policies: the CapacityScheduler, the FairScheduler, and the FIFO (first in first out) Scheduler. The default scheduler varies by Hadoop distribution, but no matter the policy used, the Scheduler allocates resources by assigning containers (bundles of physical resources) to the requesting ApplicationMaster.
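For reference, here is a minimal sketch of the canonical word-count job described above, written against the classic org.apache.hadoop.mapreduce API. It is illustrative rather than taken from this repository; the input and output paths in main() are placeholders.

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountSketch {
    // map: (offset, line) -> (word, 1)
    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // reduce: (word, [1, 1, ...]) -> (word, total count)
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountSketch.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);  // optional local pre-aggregation
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("/user/hduser/wordcount-input"));    // placeholder
        FileOutputFormat.setOutputPath(job, new Path("/user/hduser/wordcount-output")); // placeholder
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

On a YARN cluster, waitForCompletion(true) submits the job to the ResourceManager and then polls for progress, which ties into the application execution flow described in the YARN section below.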
- -ApplicationMaster -Each application running on Hadoop has its own dedicated ApplicationMaster instance. This instance lives in its own, separate container on one of the nodes in the cluster. Each application’s ApplicationMaster periodically sends heartbeat messages to the ResourceManager, as well as requests for additional resources, if needed. Additional resources are granted by the ResourceManager through the assignment of Container Resource leases, which serve as reservations for containers on NodeManagers. - -The ApplicationMaster oversees the execution of an application over its full lifespan, from requesting additional containers from the ResourceManager, to submitting container release requests to the NodeManager. - -NodeManagers -The NodeManager is a per-node agent tasked with overseeing containers throughout their lifecycles, monitoring container resource usage, and periodically communicating with the ResourceManager. - -Conceptually, NodeManagers are much like TaskTrackers in earlier versions of Hadoop. Whereas TaskTrackers used a fixed number of map and reduce slots for scheduling, NodeManagers have a number of dynamically created, arbitrarily-sized Resource Containers (RCs). Unlike slots in MR1, RCs can be used for map tasks, reduce tasks, or tasks from other frameworks. - -Executing applications with YARN -Hadoop architecture - YARN application execution diagram -Typical application execution with YARN follows this flow: - -Client program submits the MapReduce application to the ResourceManager, along with information to launch the application-specific ApplicationMaster. -ResourceManager negotiates a container for the ApplicationMaster and launches the ApplicationMaster. -ApplicationMaster boots and registers with the ResourceManager, allowing the original calling client to interface directly with the ApplicationMaster. -ApplicationMaster negotiates resources (resource containers) for client application. -ApplicationMaster gives the container launch specification to the NodeManager, which launches a container for the application. -During execution, client polls ApplicationMaster for application status and progress. -Upon completion, ApplicationMaster deregisters with the ResourceManager and shuts down, returning its containers to the resource pool. -ZooKeeper -Apache ZooKeeper is a popular tool used for coordination and synchronization of distributed systems. Since Hadoop 2.0, ZooKeeper has become an essential service for Hadoop clusters, providing a mechanism for enabling high-availability of former single points of failure, specifically the HDFS NameNode and YARN ResourceManager. - -HDFS and ZooKeeper -Hadoop architecture - NameNode HA with ZooKeeper diagram -In previous versions of Hadoop, the NameNode represented a single point of failure—should the NameNode fail, the entire HDFS cluster would become unavailable as the metadata containing the file-to-block mappings would be lost. - -Hadoop 2.0 brought many improvements, among them a high-availability NameNode service. When ZooKeeper is used in conjunction with QJM or NFS, it enables automatic failover. - -Automatic NameNode failover requires two components: a ZooKeeper quorum, and a ZKFailoverController (ZKFC) process running on each NameNode.
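The paragraph that follows describes the ephemeral "lock" znode at the heart of this mechanism. As an illustrative, hedged sketch of that general pattern, using the plain Apache ZooKeeper client API rather than HDFS's actual ZKFC code (the connect string and znode path are placeholders):

import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;

public class EphemeralLockSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder connect string; 5-second session timeout; no-op watcher.
        ZooKeeper zk = new ZooKeeper("zk1:2181,zk2:2181,zk3:2181", 5000, event -> { });
        try {
            // An ephemeral znode is deleted automatically when the owning session expires,
            // which is the signal a standby waits for before taking over.
            zk.create("/example-lock", new byte[0],
                    ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
            System.out.println("Acquired the lock; acting as the active node.");
        } catch (KeeperException.NodeExistsException e) {
            System.out.println("Another node holds the lock; staying in standby.");
        }
    }
}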
The NameNode and Standby NameNodes maintain persistent sessions in ZooKeeper, with the NameNode holding a special, ephemeral “lock” znode (the equivalent of a file or directory, in a regular file system); if the NameNode does not maintain contact with the ZooKeeper ensemble, its session is expired, triggering a failover (handled by ZKFC). - -ZKFailoverController is a process that runs alongside the NameNode and Standby NameNodes, periodically checking the health of the node it is running on. On healthy nodes, ZKFC will try to acquire the lock znode, succeeding if no other node holds the lock (which means the primary NameNode has failed). Once the lock is acquired, the new NameNode transitions to the active NameNode. - -YARN and ZooKeeper -Hadoop architecture - ResourceManager HA with ZooKeeper diagram -When YARN was initially created, its ResourceManager represented a single point of failure—if NodeManagers lost contact with the ResourceManager, all jobs in progress would be halted, and no new jobs could be assigned. - -Hadoop 2.4 improved YARN’s resilience with the release of the ResourceManager high-availability feature. The new feature incorporates ZooKeeper to allow for automatic failover to a standby ResourceManager in the event of the primary’s failure. - -Like HDFS, YARN uses a similar, ZooKeeper-managed lock to ensure only one ResourceManager is active at once. Unlike HDFS, YARN’s automatic failover mechanism does not run as a separate process—instead, its ActiveStandbyElector service is part of the ResourceManager process itself. Like ZKFailoverController, the ActiveStandbyElector service on each ResourceManager continuously vies for control of an ephemeral znode, ActiveStandbyElectorLock. Because the node is ephemeral, if the currently active RM allows the session to expire, the RM that successfully acquires a lock on the ActiveStandbyElectorLock will automatically be promoted to the active state. - -From theory, to practice -In this post, we’ve explored all the core components found in a standard Hadoop cluster. - -Read on to the next article in this series for an examination of Hadoop’s key performance metrics and health indicators. 
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt.xml.xls deleted file mode 100644 index cc7d5d602764c95dce4b84b3f1d7fc495c5527bc..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt.xml.xls and /dev/null differ diff --git "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory-relation.txt" "b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory-relation.txt" deleted file mode 100644 index eeb4d17504acdf54850585ebe2c2dc45d921f33b..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory-relation.txt" +++ /dev/null @@ -1,152 +0,0 @@ -Hadoop cluster&kove ® xpd ™ persistent memory&依赖 -Hadoop cluster&kove ® xpd ™ persistent memory&依赖 -its&information& -Hadoop cluster&NameNode server&依赖 -failure and source&data loss&AGGREGATION -Hadoop cluster&most vital information&依赖 -Hadoop cluster&RAM&依赖 -potential point&failure and source&AGGREGATION -its&server& -usual precaution&form&依赖 -datum&case&依赖 -its&operation& -datum&failure&依赖 -case&failure&AGGREGATION -it&many hour&依赖 -one&hadoop functionality&依赖 -one&hadoop functionality&依赖 -one&fast restoration&依赖 -one&fast restoration&依赖 -one&fast restoration&依赖 -fast restoration&hadoop functionality&AGGREGATION -one&fast restoration&依赖 -one&hadoop functionality&依赖 -Hadoop NameNode&NameNode&GENERALIZATION -one&hadoop functionality&依赖 -Hadoop software&NameNode&依赖 -Hadoop software&memory space&依赖 -Hadoop software&memory space&依赖 -modified version&Hadoop software&AGGREGATION -memory space&NameNode&AGGREGATION -Hadoop software&NameNode&依赖 -Standard RAM&yet another limitation&依赖 -Standard RAM&Hadoop&依赖 -Standard RAM&RAM&GENERALIZATION -it&size&依赖 -One&file&依赖 -One&much datum&依赖 -Kove XPD&size&依赖 -removal&limitation&AGGREGATION -Kove XPD&contrast&依赖 -its&block& -sake&efficiency&AGGREGATION -it&failure&依赖 -single point&failure&AGGREGATION -prospect&persistent memory&依赖 -prospect&NameNode&依赖 -advantage&implementation&AGGREGATION -power failure&failure&GENERALIZATION -Persistent memory&power failure&依赖 -device&terabyte&依赖 -size&Kove memory&AGGREGATION -NameNode&more information&依赖 -NameNode&much memory&依赖 -number&file&AGGREGATION -diagram below&thought&依赖 -our&thoughts& -summary&approach&AGGREGATION -our&approaches& -we&major actor&依赖 -structure&FSImage structure INode&依赖 -structure&INode&依赖 -structure&CorruptReplicasMap and recentInvalidateSets and PendingBlockInfo and ExcessReplicateMap and PendingReplicationBlocks and UnderReplicatedBlocks&依赖 -structure&interest&AGGREGATION -new list&block&AGGREGATION -/ /&block&依赖 -/ /&new list&依赖 -/ /&new list&依赖 -/ /&block&依赖 -possible way&usage&依赖 -possible way&special buffer&依赖 -usage&special buffer&AGGREGATION -Registration&time (&依赖 -Registration&100 microsecond&依赖 -we&4 way&依赖 -we&it&依赖 -buffer&start&依赖 -cost&data transfer to/from other memory area&AGGREGATION -start&NameNode&AGGREGATION -buffer&NameNode&依赖 -different chunk&datum&AGGREGATION -buffer&time&依赖 -combination&a ) and ( b )&AGGREGATION -we&Kove&依赖 -we&it&依赖 -we&place and transfer&依赖 -area&interest&AGGREGATION -we&buffer&依赖 -May&additional code&依赖 -May&deal&依赖 -May&caching&依赖 -overhead&what&依赖 -Easiest to 
implement&library created buffer&依赖 -we&NameNode change&实现 -we&NameNode change&实现 -we&EHCache library&实现 -we&EHCache library&实现 -your&database& -We&it&依赖 -implementation&github here and https://github.com/markkerzner/nn_kove&依赖 -combination&teragen/terasort&AGGREGATION -testing&use nnbench&依赖 -testing&use nnbench&依赖 -result&run&AGGREGATION -performance&cluster&AGGREGATION -50 %&in-memory Hadoop code&AGGREGATION -KDSA block device&block device&GENERALIZATION -initial prototype&Kove XPD&依赖 -our&prototype& -initial prototype&KDSA block device&依赖 -block device&device&GENERALIZATION -performance&block device&AGGREGATION -proper way&direct write&依赖 -C interface&performance&依赖 -proper way&Java&依赖 -C interface&block device&依赖 -we&more meticulous implementation&依赖 -four group&test result&AGGREGATION -slots_millis_maps =&6462&依赖 -Launched map task&2&依赖 -slots_millis_reduces =&9238&依赖 -Bytes Read&= 50000000&依赖 -File&Counters& -file_bytes_read =&51000264&依赖 -hdfs_bytes_read =&50000218&依赖 -file_bytes_written =&102164352&依赖 -hdfs_bytes_written =&50000000&依赖 -spill records = 1000000&spill records = 1000000&依赖 -split_raw_bytes =&218&依赖 -Reduce input record&500000&依赖 -Reduce input group&500000&依赖 -Reduce output record&500000&依赖 -Number&file&AGGREGATION -# map&barrier&依赖 -#&exception&AGGREGATION -slots_millis_maps =&6541&依赖 -slots_millis_reduces =&9293&依赖 -file_bytes_written =&102156988&依赖 -slots_millis_maps =&6249&依赖 -slots_millis_reduces =&9218&依赖 -file_bytes_written =&102156990&依赖 -slots_millis_maps =&6390&依赖 -slots_millis_reduces =&9240&依赖 -file_bytes_written =&102162937&依赖 -fast block copy&datum&AGGREGATION -Planned enhancement&fuller utilitzation&依赖 -its©& -Planned enhancement&capability&依赖 -terabyte&datum&AGGREGATION -fuller utilitzation&capability&AGGREGATION -fast block copy&terabyte&AGGREGATION -matter&second&AGGREGATION -capability&Kove XPD&AGGREGATION diff --git "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt" "b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt" deleted file mode 100644 index accbf130a9e4e2b71e0bcee12873c1ddfdf6d3af..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt" +++ /dev/null @@ -1,1138 +0,0 @@ -Hadoop clusters with Kove® XPD™ persistent memory - - -Mark Kerzner (mark@hadoopilluminated.com), Greg Keller (greg@r-hpc.com), Ivan Lazarov (ivan.lazarov@shmsoft.com), Sujee Maniyam (sujee@hadoopilluminated.com) - - -Abstract - - -Since the Hadoop cluster stores its most vital information on its NameNode server in RAM, this represents a potential point of failure and source of data loss. The usual precautions take the form of storing this information on multiple local hard drives and on a remote one. However, even if the data is preserved in the case of failure, it make take many hours to restore the cluster to its operation. - - -In contrast, by running the Hadoop NameNode on the Kove XPD, one can achieve very fast restoration of Hadoop functionality after such failures as a power loss or motherboard failure. This is accomplished by running a modified version of the Hadoop software, which maps the memory space of the NameNode onto the Kove XPD and can be found on GitHub here https://github.com/markkerzner/nn_kove. - - -Standard RAM presents yet another limitation on Hadoop: it makes it limited in size. 
One can only store as much data (files and blocks) as the RAM will allow. By contrast, Kove XPD is unlimited in size, and thus using it results in the removal of this limitation on the Hadoop cluster size. - - -Background - - -The Hadoop NameNode saves all of its block and file information in memory. This is done for the sake of efficiency, but it is naturally a single point of failure. There are multiple approaches to alleviate this SPOF, ranging from NameNode HA in Hadoop 2 to a distributed NameNode. - - -However, a very enticing prospect is running the NameNode on persistent memory, provided by the Kove XPD device. - - -The advantages of this implementation would be twofold. - - -1. Persistent memory is resistant to power failure. If this approach is proven viable, the software architecture for Hadoop NameNode on Kove can be simplified. - - -2. The size of the Kove memory is virtually unlimited, and these devices can be scaled well beyond a terabyte. With this much memory, the NameNode can store much more information, lifting the limitations on the number of files stored in Hadoop and obviating the need for federation. - - - -The diagram below summarizes our thoughts up until this point. - - - - -Possible approaches - - -Here is a summary of the approaches we have tried. - - -Given that the NameNode stores the following data in memory (simplified view), we have these major actors. - - -machine -> blockList (DataNodeMap, DatanodeDescriptor, BlockInfo) - -block -> machineList (BlocksMap, BlockInfo) - - -Also these structures are referenced within FSImage structures (INode, FSDirectory) and some additional structures like CorruptReplicasMap, recentInvalidateSets, PendingBlockInfo, ExcessReplicateMap, PendingReplicationBlocks, UnderReplicatedBlocks. - - -All structures of interest are centered around FSNamesystem and are relatively tightly coupled, which implies careful refactoring with small steps. - -Here is how a new HDFS file is created -1). Client -> Namenode - - DFSClient - -DFSOutputStream - - namenode.addBlock (through RPC) - - FSNamesystem.getAdditionalBlock() - - lease - - replicator.chooseTargets() -> DatanodeDescriptor[] - - newBlock = allocateBlock(src, pathINodes); - - FSDirectory.addBlock - - // associate the new list of blocks with this file - -namesystem.blocksMap.addINode(block, fileNode); - -BlockInfo blockInfo = namesystem.blocksMap.getStoredBlock(block); - - fileNode.addBlock(blockInfo); - -pendingFile.setTargets(targets); - - -2). Client -> Datanode - -connect to Datanode directly and transfer data... - -Class diagram displaying some of the affected classes - - -Possible ways to implement storing Namenode data on Kove instead of memory -Data exchange with Kove requires the use of a special buffer registered with the API. Registration takes time (about 100 microseconds), and so does copying data to/from buffers (each read/write takes about 1 microsecond). - -Thus we have 4 ways to use it: - - -a. Create and register a big buffer ourselves and do all modifications right inside the buffer. The buffer has to fit into normal memory. This is the fastest: the buffer is created once at the start of the NameNode, and the cost of data transfer to/from other memory areas is minimized. The open question is: “Will it be easy to implement access to the ‘native’ data structures from the DataNode, and does this bring more overhead?” This is likely the longest to implement. - - -b. Create and register smaller buffer(s) ourselves and use them multiple times for different chunks of data.
This will require moving data to and from buffers, which takes some time. But hopefully this also means fewer changes in DataNode data structures. Not limited to normal memory size. - - -c. Some combination of (a) and (b): try to cache the most frequently/recently accessed areas (blocks?) in buffers registered for data exchange. When a certain area is in memory, we modify it in place and transfer it to Kove. If not, we obsolete some buffer and use it for the area of interest. This may be an improvement over (b), but it requires additional code to deal with caching, and we need to see what overhead the caching itself will add. - - -d. Have the API register memory and transfer the data behind the scenes. Easiest to implement, but probably the slowest: we will read/write to library-created buffers and occasionally have new buffers registered. - - - -Our implementation - - -In the end, we implemented the NameNode changes using the EHCache library. Ehcache is an open source, standards-based cache for boosting performance, offloading your database, and simplifying scalability. It's the most widely-used Java-based cache because it's robust, proven, and full-featured. - - -We used it to replace the Java objects with EHCache objects and stored them on the XPD. Now came the time to test. - - -This implementation can be found on GitHub here: https://github.com/markkerzner/nn_kove. - - -Testing - - -For testing we used NNBench and a combination of teragen/terasort. The results of the runs are given below. - - -One may notice that the performance of the cluster when using Kove is about 50% of the in-memory Hadoop code. This is to be expected: our initial prototype treated the Kove XPD as a KDSA block device, since it was easier to implement. The proper way, however, will be to use direct writes through the Java-to-C interface, which has twice the performance of the block device. Thus, with the more meticulous implementation we would achieve a speed comparable to the in-memory Hadoop code. - - -Appendix: test results - - -There are four groups of test results given below.
- - -============ BLOCKSMAP + KOVE ============ - - ----- terasort ---- - - - -hadoop jar hadoop-examples-1.1.2.jar teragen 500000 /user/hduser/terasort-input - -hadoop jar hadoop-examples-1.1.2.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output - - -13/08/07 07:12:53 INFO mapred.JobClient: map 0% reduce 0% - -13/08/07 07:12:58 INFO mapred.JobClient: map 100% reduce 0% - -13/08/07 07:13:05 INFO mapred.JobClient: map 100% reduce 33% - -13/08/07 07:13:07 INFO mapred.JobClient: map 100% reduce 100% - -13/08/07 07:13:07 INFO mapred.JobClient: Job complete: job_201308070712_0002 - -13/08/07 07:13:07 INFO mapred.JobClient: Counters: 30 - -13/08/07 07:13:07 INFO mapred.JobClient: Job Counters - -13/08/07 07:13:07 INFO mapred.JobClient: Launched reduce tasks=1 - -13/08/07 07:13:07 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=6462 - -13/08/07 07:13:07 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 - -13/08/07 07:13:07 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 - -13/08/07 07:13:07 INFO mapred.JobClient: Rack-local map tasks=2 - -13/08/07 07:13:07 INFO mapred.JobClient: Launched map tasks=2 - -13/08/07 07:13:07 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=9238 - -13/08/07 07:13:07 INFO mapred.JobClient: File Input Format Counters - -13/08/07 07:13:07 INFO mapred.JobClient: Bytes Read=50000000 - -13/08/07 07:13:07 INFO mapred.JobClient: File Output Format Counters - -13/08/07 07:13:07 INFO mapred.JobClient: Bytes Written=50000000 - -13/08/07 07:13:07 INFO mapred.JobClient: FileSystemCounters - -13/08/07 07:13:07 INFO mapred.JobClient: FILE_BYTES_READ=51000264 - -13/08/07 07:13:07 INFO mapred.JobClient: HDFS_BYTES_READ=50000218 - -13/08/07 07:13:07 INFO mapred.JobClient: FILE_BYTES_WRITTEN=102164352 - -13/08/07 07:13:07 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=50000000 - -13/08/07 07:13:07 INFO mapred.JobClient: Map-Reduce Framework - -13/08/07 07:13:07 INFO mapred.JobClient: Map output materialized bytes=51000012 - -13/08/07 07:13:07 INFO mapred.JobClient: Map input records=500000 - -13/08/07 07:13:07 INFO mapred.JobClient: Reduce shuffle bytes=51000012 - -13/08/07 07:13:07 INFO mapred.JobClient: Spilled Records=1000000 - -13/08/07 07:13:07 INFO mapred.JobClient: Map output bytes=50000000 - -13/08/07 07:13:07 INFO mapred.JobClient: Total committed heap usage (bytes)=602996736 - -13/08/07 07:13:07 INFO mapred.JobClient: CPU time spent (ms)=6860 - -13/08/07 07:13:07 INFO mapred.JobClient: Map input bytes=50000000 - -13/08/07 07:13:07 INFO mapred.JobClient: SPLIT_RAW_BYTES=218 - -13/08/07 07:13:07 INFO mapred.JobClient: Combine input records=0 - -13/08/07 07:13:07 INFO mapred.JobClient: Reduce input records=500000 - -13/08/07 07:13:07 INFO mapred.JobClient: Reduce input groups=500000 - -13/08/07 07:13:07 INFO mapred.JobClient: Combine output records=0 - -13/08/07 07:13:07 INFO mapred.JobClient: Physical memory (bytes) snapshot=615641088 - -13/08/07 07:13:07 INFO mapred.JobClient: Reduce output records=500000 - -13/08/07 07:13:07 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2303033344 - -13/08/07 07:13:07 INFO mapred.JobClient: Map output records=500000 - -13/08/07 07:13:07 INFO terasort.TeraSort: done - - - -map() completion: 1.0 - -reduce() completion: 1.0 - - -Counters: 30 - - Job Counters - - Launched reduce tasks=1 - - SLOTS_MILLIS_MAPS=6462 - - - Total time spent by all reduces waiting after reserving slots (ms)=0 - - Total time spent by all maps waiting after reserving slots (ms)=0 - 
- Rack-local map tasks=2 - - Launched map tasks=2 - - SLOTS_MILLIS_REDUCES=9238 - - - File Input Format Counters - - Bytes Read=50000000 - - File Output Format Counters - - Bytes Written=50000000 - - FileSystemCounters - - FILE_BYTES_READ=51000264 - - HDFS_BYTES_READ=50000218 - - FILE_BYTES_WRITTEN=102164352 - - - HDFS_BYTES_WRITTEN=50000000 - - Map-Reduce Framework - - Map output materialized bytes=51000012 - - Map input records=500000 - - Reduce shuffle bytes=51000012 - - Spilled Records=1000000 - - Map output bytes=50000000 - - Total committed heap usage (bytes)=602996736 - - CPU time spent (ms)=6860 - - - Map input bytes=50000000 - - SPLIT_RAW_BYTES=218 - - Combine input records=0 - - Reduce input records=500000 - - Reduce input groups=500000 - - Combine output records=0 - - Physical memory (bytes) snapshot=615641088 - - Reduce output records=500000 - - Virtual memory (bytes) snapshot=2303033344 - - Map output records=500000 - - - ----- nn bench ---- - - -hadoop jar hadoop-test-1.1.2.jar nnbench -operation create_write -maps 2 -reduces 1 -blockSize 1 -bytesToWrite 20 -bytesPerChecksum 1 -numberOfFiles 100 -replicationFactorPerFile 1 - - -13/08/07 07:53:08 INFO hdfs.NNBench: -------------- NNBench -------------- : - -13/08/07 07:53:08 INFO hdfs.NNBench: Version: NameNode Benchmark 0.4 - -13/08/07 07:53:08 INFO hdfs.NNBench: Date & time: 2013-08-07 07:53:08,57 - -13/08/07 07:53:08 INFO hdfs.NNBench: - -13/08/07 07:53:08 INFO hdfs.NNBench: Test Operation: create_write - -13/08/07 07:53:08 INFO hdfs.NNBench: Start time: 2013-08-07 07:45:15,177 - -13/08/07 07:53:08 INFO hdfs.NNBench: Maps to run: 2 - -13/08/07 07:53:08 INFO hdfs.NNBench: Reduces to run: 1 - -13/08/07 07:53:08 INFO hdfs.NNBench: Block Size (bytes): 1 - -13/08/07 07:53:08 INFO hdfs.NNBench: Bytes to write: 20 - -13/08/07 07:53:08 INFO hdfs.NNBench: Bytes per checksum: 1 - -13/08/07 07:53:08 INFO hdfs.NNBench: Number of files: 100 - -13/08/07 07:53:08 INFO hdfs.NNBench: Replication factor: 1 - -13/08/07 07:53:08 INFO hdfs.NNBench: Successful file operations: 200 - -13/08/07 07:53:08 INFO hdfs.NNBench: - -13/08/07 07:53:08 INFO hdfs.NNBench: # maps that missed the barrier: 0 - -13/08/07 07:53:08 INFO hdfs.NNBench: # exceptions: 0 - -13/08/07 07:53:08 INFO hdfs.NNBench: - -13/08/07 07:53:08 INFO hdfs.NNBench: TPS: Create/Write/Close: 65 - -13/08/07 07:53:08 INFO hdfs.NNBench: Avg exec time (ms): Create/Write/Close: 60.03 - -13/08/07 07:53:08 INFO hdfs.NNBench: Avg Lat (ms): Create/Write: 3.58 - -13/08/07 07:53:08 INFO hdfs.NNBench: Avg Lat (ms): Close: 56.375 - -13/08/07 07:53:08 INFO hdfs.NNBench: - -13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: AL Total #1: 716 - -13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: AL Total #2: 11275 - -13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: TPS Total (ms): 12006 - -13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: Longest Map Time (ms): 6143.0 - -13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: Late maps: 0 - -13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: # of exceptions: 0 - - -------------------------------------------------------------------------------------------------------------------- - -============ BLOCKSMAP + DISK ============ - - ----- terasort ---- - - - -hadoop jar hadoop-examples-1.1.2.jar teragen 500000 /user/hduser/terasort-input - -hadoop jar hadoop-examples-1.1.2.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output - - -13/08/07 08:06:46 INFO mapred.JobClient: Running job: job_201308070806_0002 - -13/08/07 08:06:47 INFO mapred.JobClient: map 0% reduce 0% - 
-13/08/07 08:06:52 INFO mapred.JobClient: map 100% reduce 0% - -13/08/07 08:06:59 INFO mapred.JobClient: map 100% reduce 33% - -13/08/07 08:07:01 INFO mapred.JobClient: map 100% reduce 100% - -13/08/07 08:07:01 INFO mapred.JobClient: Job complete: job_201308070806_0002 - -13/08/07 08:07:01 INFO mapred.JobClient: Counters: 30 - -13/08/07 08:07:01 INFO mapred.JobClient: Job Counters - -13/08/07 08:07:01 INFO mapred.JobClient: Launched reduce tasks=1 - -13/08/07 08:07:01 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=6541 - -13/08/07 08:07:01 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 - -13/08/07 08:07:01 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 - -13/08/07 08:07:01 INFO mapred.JobClient: Rack-local map tasks=2 - -13/08/07 08:07:01 INFO mapred.JobClient: Launched map tasks=2 - -13/08/07 08:07:01 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=9293 - -13/08/07 08:07:01 INFO mapred.JobClient: File Input Format Counters - -13/08/07 08:07:01 INFO mapred.JobClient: Bytes Read=50000000 - -13/08/07 08:07:01 INFO mapred.JobClient: File Output Format Counters - -13/08/07 08:07:01 INFO mapred.JobClient: Bytes Written=50000000 - -13/08/07 08:07:01 INFO mapred.JobClient: FileSystemCounters - -13/08/07 08:07:01 INFO mapred.JobClient: FILE_BYTES_READ=51000264 - -13/08/07 08:07:01 INFO mapred.JobClient: HDFS_BYTES_READ=50000218 - -13/08/07 08:07:01 INFO mapred.JobClient: FILE_BYTES_WRITTEN=102156988 - -13/08/07 08:07:01 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=50000000 - -13/08/07 08:07:01 INFO mapred.JobClient: Map-Reduce Framework - -13/08/07 08:07:01 INFO mapred.JobClient: Map output materialized bytes=51000012 - -13/08/07 08:07:01 INFO mapred.JobClient: Map input records=500000 - -13/08/07 08:07:01 INFO mapred.JobClient: Reduce shuffle bytes=51000012 - -13/08/07 08:07:01 INFO mapred.JobClient: Spilled Records=1000000 - -13/08/07 08:07:01 INFO mapred.JobClient: Map output bytes=50000000 - -13/08/07 08:07:01 INFO mapred.JobClient: Total committed heap usage (bytes)=602996736 - -13/08/07 08:07:01 INFO mapred.JobClient: CPU time spent (ms)=6940 - -13/08/07 08:07:01 INFO mapred.JobClient: Map input bytes=50000000 - -13/08/07 08:07:01 INFO mapred.JobClient: SPLIT_RAW_BYTES=218 - -13/08/07 08:07:01 INFO mapred.JobClient: Combine input records=0 - -13/08/07 08:07:01 INFO mapred.JobClient: Reduce input records=500000 - -13/08/07 08:07:01 INFO mapred.JobClient: Reduce input groups=500000 - -13/08/07 08:07:01 INFO mapred.JobClient: Combine output records=0 - -13/08/07 08:07:01 INFO mapred.JobClient: Physical memory (bytes) snapshot=612827136 - -13/08/07 08:07:01 INFO mapred.JobClient: Reduce output records=500000 - -13/08/07 08:07:01 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2305966080 - -13/08/07 08:07:01 INFO mapred.JobClient: Map output records=500000 - -13/08/07 08:07:01 INFO terasort.TeraSort: done - - - -Counters: 30 - - Job Counters - - Launched reduce tasks=1 - - SLOTS_MILLIS_MAPS=6541 - - - Total time spent by all reduces waiting after reserving slots (ms)=0 - - Total time spent by all maps waiting after reserving slots (ms)=0 - - Rack-local map tasks=2 - - Launched map tasks=2 - - SLOTS_MILLIS_REDUCES=9293 - - - File Input Format Counters - - Bytes Read=50000000 - - File Output Format Counters - - Bytes Written=50000000 - - FileSystemCounters - - FILE_BYTES_READ=51000264 - - HDFS_BYTES_READ=50000218 - - FILE_BYTES_WRITTEN=102156988 - - - HDFS_BYTES_WRITTEN=50000000 - - Map-Reduce Framework - - Map output 
materialized bytes=51000012 - - Map input records=500000 - - Reduce shuffle bytes=51000012 - - Spilled Records=1000000 - - Map output bytes=50000000 - - Total committed heap usage (bytes)=602996736 - - CPU time spent (ms)=6940 - - - Map input bytes=50000000 - - SPLIT_RAW_BYTES=218 - - Combine input records=0 - - Reduce input records=500000 - - Reduce input groups=500000 - - Combine output records=0 - - Physical memory (bytes) snapshot=612827136 - - Reduce output records=500000 - - Virtual memory (bytes) snapshot=2305966080 - - Map output records=500000 - - - - - ----- nn bench ---- - - -hadoop jar hadoop-test-1.1.2.jar nnbench -operation create_write -maps 2 -reduces 1 -blockSize 1 -bytesToWrite 20 -bytesPerChecksum 1 -numberOfFiles 100 -replicationFactorPerFile 1 - - -13/08/07 08:11:17 INFO hdfs.NNBench: -------------- NNBench -------------- : - -13/08/07 08:11:17 INFO hdfs.NNBench: Version: NameNode Benchmark 0.4 - -13/08/07 08:11:17 INFO hdfs.NNBench: Date & time: 2013-08-07 08:11:17,388 - -13/08/07 08:11:17 INFO hdfs.NNBench: - -13/08/07 08:11:17 INFO hdfs.NNBench: Test Operation: create_write - -13/08/07 08:11:17 INFO hdfs.NNBench: Start time: 2013-08-07 08:11:01,121 - -13/08/07 08:11:17 INFO hdfs.NNBench: Maps to run: 2 - -13/08/07 08:11:17 INFO hdfs.NNBench: Reduces to run: 1 - -13/08/07 08:11:17 INFO hdfs.NNBench: Block Size (bytes): 1 - -13/08/07 08:11:17 INFO hdfs.NNBench: Bytes to write: 20 - -13/08/07 08:11:17 INFO hdfs.NNBench: Bytes per checksum: 1 - -13/08/07 08:11:17 INFO hdfs.NNBench: Number of files: 100 - -13/08/07 08:11:17 INFO hdfs.NNBench: Replication factor: 1 - -13/08/07 08:11:17 INFO hdfs.NNBench: Successful file operations: 200 - -13/08/07 08:11:17 INFO hdfs.NNBench: - -13/08/07 08:11:17 INFO hdfs.NNBench: # maps that missed the barrier: 0 - -13/08/07 08:11:17 INFO hdfs.NNBench: # exceptions: 0 - -13/08/07 08:11:17 INFO hdfs.NNBench: - -13/08/07 08:11:17 INFO hdfs.NNBench: TPS: Create/Write/Close: 65 - -13/08/07 08:11:17 INFO hdfs.NNBench: Avg exec time (ms): Create/Write/Close: 58.86 - -13/08/07 08:11:17 INFO hdfs.NNBench: Avg Lat (ms): Create/Write: 3.18 - -13/08/07 08:11:17 INFO hdfs.NNBench: Avg Lat (ms): Close: 55.59 - -13/08/07 08:11:17 INFO hdfs.NNBench: - -13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: AL Total #1: 636 - -13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: AL Total #2: 11118 - -13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: TPS Total (ms): 11772 - -13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: Longest Map Time (ms): 6122.0 - -13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: Late maps: 0 - -13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: # of exceptions: 0 - -13/08/07 08:11:17 INFO hdfs.NNBench: - - - -============ REGULAR HADOOP + DISK ============ - - ----- terasort ---- - - - -hadoop jar hadoop-examples-1.1.2.jar teragen 500000 /user/hduser/terasort-input - -hadoop jar hadoop-examples-1.1.2.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output - - -13/08/07 08:26:03 INFO mapred.JobClient: Running job: job_201308070825_0002 - -13/08/07 08:26:04 INFO mapred.JobClient: map 0% reduce 0% - -13/08/07 08:26:08 INFO mapred.JobClient: map 100% reduce 0% - -13/08/07 08:26:15 INFO mapred.JobClient: map 100% reduce 33% - -13/08/07 08:26:17 INFO mapred.JobClient: map 100% reduce 100% - -13/08/07 08:26:17 INFO mapred.JobClient: Job complete: job_201308070825_0002 - -13/08/07 08:26:17 INFO mapred.JobClient: Counters: 30 - -13/08/07 08:26:17 INFO mapred.JobClient: Job Counters - -13/08/07 08:26:17 INFO mapred.JobClient: Launched reduce tasks=1 
- -13/08/07 08:26:17 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=6249 - -13/08/07 08:26:17 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 - -13/08/07 08:26:17 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 - -13/08/07 08:26:17 INFO mapred.JobClient: Launched map tasks=2 - -13/08/07 08:26:17 INFO mapred.JobClient: Data-local map tasks=2 - -13/08/07 08:26:17 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=9218 - -13/08/07 08:26:17 INFO mapred.JobClient: File Input Format Counters - -13/08/07 08:26:17 INFO mapred.JobClient: Bytes Read=50000000 - -13/08/07 08:26:17 INFO mapred.JobClient: File Output Format Counters - -13/08/07 08:26:17 INFO mapred.JobClient: Bytes Written=50000000 - -13/08/07 08:26:17 INFO mapred.JobClient: FileSystemCounters - -13/08/07 08:26:17 INFO mapred.JobClient: FILE_BYTES_READ=51000264 - -13/08/07 08:26:17 INFO mapred.JobClient: HDFS_BYTES_READ=50000218 - -13/08/07 08:26:17 INFO mapred.JobClient: FILE_BYTES_WRITTEN=102156990 - -13/08/07 08:26:17 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=50000000 - -13/08/07 08:26:17 INFO mapred.JobClient: Map-Reduce Framework - -13/08/07 08:26:17 INFO mapred.JobClient: Map output materialized bytes=51000012 - -13/08/07 08:26:17 INFO mapred.JobClient: Map input records=500000 - -13/08/07 08:26:17 INFO mapred.JobClient: Reduce shuffle bytes=51000012 - -13/08/07 08:26:17 INFO mapred.JobClient: Spilled Records=1000000 - -13/08/07 08:26:17 INFO mapred.JobClient: Map output bytes=50000000 - -13/08/07 08:26:17 INFO mapred.JobClient: Total committed heap usage (bytes)=602996736 - -13/08/07 08:26:17 INFO mapred.JobClient: CPU time spent (ms)=6690 - -13/08/07 08:26:17 INFO mapred.JobClient: Map input bytes=50000000 - -13/08/07 08:26:17 INFO mapred.JobClient: SPLIT_RAW_BYTES=218 - -13/08/07 08:26:17 INFO mapred.JobClient: Combine input records=0 - -13/08/07 08:26:17 INFO mapred.JobClient: Reduce input records=500000 - -13/08/07 08:26:17 INFO mapred.JobClient: Reduce input groups=500000 - -13/08/07 08:26:17 INFO mapred.JobClient: Combine output records=0 - -13/08/07 08:26:17 INFO mapred.JobClient: Physical memory (bytes) snapshot=609116160 - -13/08/07 08:26:17 INFO mapred.JobClient: Reduce output records=500000 - -13/08/07 08:26:17 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2309636096 - -13/08/07 08:26:17 INFO mapred.JobClient: Map output records=500000 - -13/08/07 08:26:17 INFO terasort.TeraSort: done - - - -Counters: 30 - - Job Counters - - Launched reduce tasks=1 - - SLOTS_MILLIS_MAPS=6249 - - - Total time spent by all reduces waiting after reserving slots (ms)=0 - - Total time spent by all maps waiting after reserving slots (ms)=0 - - Launched map tasks=2 - - Data-local map tasks=2 - - SLOTS_MILLIS_REDUCES=9218 - - - File Input Format Counters - - Bytes Read=50000000 - - File Output Format Counters - - Bytes Written=50000000 - - FileSystemCounters - - FILE_BYTES_READ=51000264 - - HDFS_BYTES_READ=50000218 - - FILE_BYTES_WRITTEN=102156990 - - - HDFS_BYTES_WRITTEN=50000000 - - Map-Reduce Framework - - Map output materialized bytes=51000012 - - Map input records=500000 - - Reduce shuffle bytes=51000012 - - Spilled Records=1000000 - - Map output bytes=50000000 - - Total committed heap usage (bytes)=602996736 - - CPU time spent (ms)=6690 - - - Map input bytes=50000000 - - SPLIT_RAW_BYTES=218 - - Combine input records=0 - - Reduce input records=500000 - - Reduce input groups=500000 - - Combine output records=0 - - Physical memory (bytes) snapshot=609116160 - - 
Reduce output records=500000 - - Virtual memory (bytes) snapshot=2309636096 - - Map output records=500000 - - - ----- nn bench ---- - - -hadoop jar hadoop-test-1.1.2.jar nnbench -operation create_write -maps 2 -reduces 1 -blockSize 1 -bytesToWrite 20 -bytesPerChecksum 1 -numberOfFiles 100 -replicationFactorPerFile 1 - - -13/08/07 08:30:45 INFO hdfs.NNBench: -------------- NNBench -------------- : - -13/08/07 08:30:45 INFO hdfs.NNBench: Version: NameNode Benchmark 0.4 - -13/08/07 08:30:45 INFO hdfs.NNBench: Date & time: 2013-08-07 08:30:45,180 - -13/08/07 08:30:45 INFO hdfs.NNBench: - -13/08/07 08:30:45 INFO hdfs.NNBench: Test Operation: create_write - -13/08/07 08:30:45 INFO hdfs.NNBench: Start time: 2013-08-07 08:30:30,955 - -13/08/07 08:30:45 INFO hdfs.NNBench: Maps to run: 2 - -13/08/07 08:30:45 INFO hdfs.NNBench: Reduces to run: 1 - -13/08/07 08:30:45 INFO hdfs.NNBench: Block Size (bytes): 1 - -13/08/07 08:30:45 INFO hdfs.NNBench: Bytes to write: 20 - -13/08/07 08:30:45 INFO hdfs.NNBench: Bytes per checksum: 1 - -13/08/07 08:30:45 INFO hdfs.NNBench: Number of files: 100 - -13/08/07 08:30:45 INFO hdfs.NNBench: Replication factor: 1 - -13/08/07 08:30:45 INFO hdfs.NNBench: Successful file operations: 200 - -13/08/07 08:30:45 INFO hdfs.NNBench: - -13/08/07 08:30:45 INFO hdfs.NNBench: # maps that missed the barrier: 0 - -13/08/07 08:30:45 INFO hdfs.NNBench: # exceptions: 0 - -13/08/07 08:30:45 INFO hdfs.NNBench: - -13/08/07 08:30:45 INFO hdfs.NNBench: TPS: Create/Write/Close: 87 - -13/08/07 08:30:45 INFO hdfs.NNBench: Avg exec time (ms): Create/Write/Close: 42.895 - -13/08/07 08:30:45 INFO hdfs.NNBench: Avg Lat (ms): Create/Write: 3.16 - -13/08/07 08:30:45 INFO hdfs.NNBench: Avg Lat (ms): Close: 39.655 - -13/08/07 08:30:45 INFO hdfs.NNBench: - -13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: AL Total #1: 632 - -13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: AL Total #2: 7931 - -13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: TPS Total (ms): 8579 - -13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: Longest Map Time (ms): 4547.0 - -13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: Late maps: 0 - -13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: # of exceptions: 0 - -13/08/07 08:30:45 INFO hdfs.NNBench: - - -============ REGULAR HADOOP + KOVE ============ - - ----- terasort ---- - - - -hadoop jar hadoop-examples-1.1.2.jar teragen 500000 /user/hduser/terasort-input - -hadoop jar hadoop-examples-1.1.2.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output - - -13/08/07 08:35:25 INFO mapred.JobClient: Running job: job_201308070834_0002 - -13/08/07 08:35:26 INFO mapred.JobClient: map 0% reduce 0% - -13/08/07 08:35:31 INFO mapred.JobClient: map 100% reduce 0% - -13/08/07 08:35:38 INFO mapred.JobClient: map 100% reduce 33% - -13/08/07 08:35:40 INFO mapred.JobClient: map 100% reduce 100% - -13/08/07 08:35:40 INFO mapred.JobClient: Job complete: job_201308070834_0002 - -13/08/07 08:35:40 INFO mapred.JobClient: Counters: 30 - -13/08/07 08:35:40 INFO mapred.JobClient: Job Counters - -13/08/07 08:35:40 INFO mapred.JobClient: Launched reduce tasks=1 - -13/08/07 08:35:40 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=6390 - -13/08/07 08:35:40 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 - -13/08/07 08:35:40 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 - -13/08/07 08:35:40 INFO mapred.JobClient: Rack-local map tasks=2 - -13/08/07 08:35:40 INFO mapred.JobClient: Launched map tasks=2 - -13/08/07 08:35:40 INFO 
mapred.JobClient: SLOTS_MILLIS_REDUCES=9240 - -13/08/07 08:35:40 INFO mapred.JobClient: File Input Format Counters - -13/08/07 08:35:40 INFO mapred.JobClient: Bytes Read=50000000 - -13/08/07 08:35:40 INFO mapred.JobClient: File Output Format Counters - -13/08/07 08:35:40 INFO mapred.JobClient: Bytes Written=50000000 - -13/08/07 08:35:40 INFO mapred.JobClient: FileSystemCounters - -13/08/07 08:35:40 INFO mapred.JobClient: FILE_BYTES_READ=51000264 - -13/08/07 08:35:40 INFO mapred.JobClient: HDFS_BYTES_READ=50000218 - -13/08/07 08:35:40 INFO mapred.JobClient: FILE_BYTES_WRITTEN=102162937 - -13/08/07 08:35:40 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=50000000 - -13/08/07 08:35:40 INFO mapred.JobClient: Map-Reduce Framework - -13/08/07 08:35:40 INFO mapred.JobClient: Map output materialized bytes=51000012 - -13/08/07 08:35:40 INFO mapred.JobClient: Map input records=500000 - -13/08/07 08:35:40 INFO mapred.JobClient: Reduce shuffle bytes=51000012 - -13/08/07 08:35:40 INFO mapred.JobClient: Spilled Records=1000000 - -13/08/07 08:35:40 INFO mapred.JobClient: Map output bytes=50000000 - -13/08/07 08:35:40 INFO mapred.JobClient: Total committed heap usage (bytes)=602996736 - -13/08/07 08:35:40 INFO mapred.JobClient: CPU time spent (ms)=6660 - -13/08/07 08:35:40 INFO mapred.JobClient: Map input bytes=50000000 - -13/08/07 08:35:40 INFO mapred.JobClient: SPLIT_RAW_BYTES=218 - -13/08/07 08:35:40 INFO mapred.JobClient: Combine input records=0 - -13/08/07 08:35:40 INFO mapred.JobClient: Reduce input records=500000 - -13/08/07 08:35:40 INFO mapred.JobClient: Reduce input groups=500000 - -13/08/07 08:35:40 INFO mapred.JobClient: Combine output records=0 - -13/08/07 08:35:40 INFO mapred.JobClient: Physical memory (bytes) snapshot=611500032 - -13/08/07 08:35:40 INFO mapred.JobClient: Reduce output records=500000 - -13/08/07 08:35:40 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2300420096 - -13/08/07 08:35:40 INFO mapred.JobClient: Map output records=500000 - -13/08/07 08:35:40 INFO terasort.TeraSort: done - - - -Counters: 30 - - Job Counters - - Launched reduce tasks=1 - - SLOTS_MILLIS_MAPS=6390 - - - Total time spent by all reduces waiting after reserving slots (ms)=0 - - Total time spent by all maps waiting after reserving slots (ms)=0 - - Rack-local map tasks=2 - - Launched map tasks=2 - - SLOTS_MILLIS_REDUCES=9240 - - - File Input Format Counters - - Bytes Read=50000000 - - File Output Format Counters - - Bytes Written=50000000 - - FileSystemCounters - - FILE_BYTES_READ=51000264 - - HDFS_BYTES_READ=50000218 - - FILE_BYTES_WRITTEN=102162937 - - - HDFS_BYTES_WRITTEN=50000000 - - Map-Reduce Framework - - Map output materialized bytes=51000012 - - Map input records=500000 - - Reduce shuffle bytes=51000012 - - Spilled Records=1000000 - - Map output bytes=50000000 - - Total committed heap usage (bytes)=602996736 - - CPU time spent (ms)=6660 - - - Map input bytes=50000000 - - SPLIT_RAW_BYTES=218 - - Combine input records=0 - - Reduce input records=500000 - - Reduce input groups=500000 - - Combine output records=0 - - Physical memory (bytes) snapshot=611500032 - - Reduce output records=500000 - - Virtual memory (bytes) snapshot=2300420096 - - Map output records=500000 - - - ----- nn bench ---- - - -hadoop jar hadoop-test-1.1.2.jar nnbench -operation create_write -maps 2 -reduces 1 -blockSize 1 -bytesToWrite 20 -bytesPerChecksum 1 -numberOfFiles 100 -replicationFactorPerFile 1 - - -13/08/07 08:42:43 INFO hdfs.NNBench: -------------- NNBench -------------- : - -13/08/07 08:42:43 INFO hdfs.NNBench: 
Version: NameNode Benchmark 0.4 - -13/08/07 08:42:43 INFO hdfs.NNBench: Date & time: 2013-08-07 08:42:43,678 - -13/08/07 08:42:43 INFO hdfs.NNBench: - -13/08/07 08:42:43 INFO hdfs.NNBench: Test Operation: create_write - -13/08/07 08:42:43 INFO hdfs.NNBench: Start time: 2013-08-07 08:42:29,426 - -13/08/07 08:42:43 INFO hdfs.NNBench: Maps to run: 2 - -13/08/07 08:42:43 INFO hdfs.NNBench: Reduces to run: 1 - -13/08/07 08:42:43 INFO hdfs.NNBench: Block Size (bytes): 1 - -13/08/07 08:42:43 INFO hdfs.NNBench: Bytes to write: 20 - -13/08/07 08:42:43 INFO hdfs.NNBench: Bytes per checksum: 1 - -13/08/07 08:42:43 INFO hdfs.NNBench: Number of files: 100 - -13/08/07 08:42:43 INFO hdfs.NNBench: Replication factor: 1 - -13/08/07 08:42:43 INFO hdfs.NNBench: Successful file operations: 200 - -13/08/07 08:42:43 INFO hdfs.NNBench: - -13/08/07 08:42:43 INFO hdfs.NNBench: # maps that missed the barrier: 0 - -13/08/07 08:42:43 INFO hdfs.NNBench: # exceptions: 0 - -13/08/07 08:42:43 INFO hdfs.NNBench: - -13/08/07 08:42:43 INFO hdfs.NNBench: TPS: Create/Write/Close: 90 - -13/08/07 08:42:43 INFO hdfs.NNBench: Avg exec time (ms): Create/Write/Close: 42.665 - -13/08/07 08:42:43 INFO hdfs.NNBench: Avg Lat (ms): Create/Write: 3.015 - -13/08/07 08:42:43 INFO hdfs.NNBench: Avg Lat (ms): Close: 39.61 - -13/08/07 08:42:43 INFO hdfs.NNBench: - -13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: AL Total #1: 603 - -13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: AL Total #2: 7922 - -13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: TPS Total (ms): 8533 - -13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: Longest Map Time (ms): 4437.0 - -13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: Late maps: 0 - -13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: # of exceptions: 0 - -13/08/07 08:42:43 INFO hdfs.NNBench: - - - -Conclusion - - -It has been shown that running Hadoop NameNode on a Kove XPD improves cluster reliability and removes the memory size limitation usual for the RAM-based NameNode. - - -Planned enhancements include making fuller utilitzation of all the capabilities of the Kove XPD, described here http://kove.com/, such as its fast block copy of terabytes of data in a matter of seconds. 
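The nnbench runs above are dominated by NameNode metadata operations (create, write a handful of bytes, close) rather than by data volume, which is why the average close latency dwarfs the create/write latency. For readers who want to reproduce that kind of measurement programmatically instead of through the benchmark jar, here is a minimal, illustrative sketch against the standard Hadoop FileSystem API; the target path, file count, and payload size are placeholders, and it assumes a client configuration (core-site.xml/hdfs-site.xml) pointing at the cluster under test.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Illustrative nnbench-style create_write probe, not the benchmark itself.
 * Paths, counts and sizes are placeholders.
 */
public class CreateWriteProbe {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();      // picks up core-site.xml / hdfs-site.xml
        FileSystem fs = FileSystem.get(conf);          // assumes fs.defaultFS points at the test cluster
        byte[] payload = new byte[20];                 // 20 bytes per file, as in the runs above
        int files = 100;
        long start = System.nanoTime();
        for (int i = 0; i < files; i++) {
            Path p = new Path("/benchmarks/probe/file-" + i);
            FSDataOutputStream out = fs.create(p, true);   // create (NameNode) + write (DataNode pipeline)
            out.write(payload);
            out.close();                                    // close completes the file at the NameNode
        }
        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
        System.out.println(files + " create/write/close operations in " + elapsedMs + " ms");
        fs.close();
    }
}
```

The elapsed time printed here only loosely corresponds to nnbench's TPS figure, since the benchmark additionally coordinates several map tasks behind a start barrier, as the "# maps that missed the barrier" line indicates.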
\ No newline at end of file diff --git "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt.xml.xls" "b/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt.xml.xls" deleted file mode 100644 index c8dfe8d64e2c402e7860c1ac020527c429887335..0000000000000000000000000000000000000000 Binary files "a/src/main/resources/sdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt.xml.xls" and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture-relation.txt deleted file mode 100644 index cc6b4ee069bb658a04c893c7d0434b9596a7e483..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture-relation.txt +++ /dev/null @@ -1,176 +0,0 @@ -get start key design&hdfs architecture march 31 , 2021 hdf ( hadoop&AGGREGATION -get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 -get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 -get start key design&get start key design&依赖 -get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 -get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 -get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 -get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 -get start key design&get start key design&依赖 -get start key design&get start key design&依赖 -get start key design&get start key design&依赖 -get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 -get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 -get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 -get start key design&get start key design&依赖 -get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 -several feature&design&AGGREGATION -It&Hadoop framework&实现 -Hadoop framework&framework&GENERALIZATION -hdf ( hadoop&key difference&依赖 -hdf ( hadoop&key difference&依赖 -hdf ( hadoop&other distributed file system&依赖 -fault-tolerance&low-cost hardware&依赖 -hdf ( hadoop&other distributed file system&依赖 -big datum framework general design&hdf datum storage policy colocation&AGGREGATION -what traditional datum processing software application&what traditional datum processing software application&依赖 -its&benefits& -Big data framework&4Vs namely&依赖 -framework&processing&依赖 -framework&datum&依赖 -framework&datum&依赖 -framework&datum&依赖 -( massive amount&datum&AGGREGATION -framework&processing&依赖 
-framework&processing&依赖 -processing&datum&AGGREGATION -file system ( hdf )&Hadoop framework&依赖 -hdf&Hadoop technical framework&依赖 -distributed file system&Hadoop technical framework&AGGREGATION -It&scenario&依赖 -website user datum behavior datum storage&website user datum behavior datum storage&依赖 -its&architecture& -General design&design feature&依赖 -design feature&efficient working&依赖 -design feature&architecture&AGGREGATION -its&working& -General design&HDFS architecture The hdf&AGGREGATION -design feature&following&依赖 -General design&architecture&依赖 -hdf&namespace and storage&依赖 -hdf&distinction&依赖 -hdf&feature&依赖 -hdf&data replication&依赖 -data replication&system&依赖 -availability&system&AGGREGATION -data replication&availability&依赖 -single block&datum&AGGREGATION -client&2 other node&依赖 -single block&3 node&依赖 -client&block&依赖 -failure&‘ DataNode ’&依赖 -HDFS framework&framework&GENERALIZATION -primary component&meta-data&依赖 -meta-data&file&AGGREGATION -primary component&file&依赖 -‘ NameNode ’&HDFS framework&依赖 -master node&node&GENERALIZATION -It&master node&依赖 -creation , deletion , and replication&data block&AGGREGATION -node&actual datum&依赖 -node&hdf&依赖 -its&space& -number&replica&AGGREGATION -hdf consist&NameNodes and DataNodes&依赖 -hdf consist&NameNodes and DataNodes&依赖 -hdf consist&NameNodes and DataNodes&AGGREGATION -one NameNode&client&依赖 -single cluster&one NameNode&依赖 -one NameNode&data access&依赖 -DataNode&instruction&依赖 -DataNode&NameNode&依赖 -hdf&coherent file system&依赖 -external process&system&依赖 -external process&one unified system&依赖 -file&blocks& -number&application&依赖 -NameNode&change&依赖 -namenode insert&new file&依赖 -namenode insert&new file&依赖 -creation&new file&AGGREGATION -namenode insert&record&依赖 -namenode insert&creation&依赖 -namenode insert&creation&依赖 -namenode insert&new file&依赖 -new file&hdf&依赖 -namenode insert&creation&依赖 -namenode insert&record&依赖 -namenode insert&record&依赖 -robustness&failure&依赖 -3 common type&failure&AGGREGATION -robustness&failure&依赖 -robustness&3 common type&依赖 -Its&robustness& -robustness&3 common type&依赖 -datum&size 64MB&依赖 -block&size 64MB&AGGREGATION -datum&hdf&依赖 -datum&block&依赖 -hdf&stored datum&依赖 -failure&component&AGGREGATION -completeness&stored datum&AGGREGATION -case&failure&AGGREGATION -hdf&completeness&依赖 -DataNode periodically report&’ message&依赖 -DataNode periodically report&NameNode&依赖 -NameNode&procedure&依赖 -data balance mechanism&datum&依赖 -even distribution&datum&AGGREGATION -data balance mechanism&even distribution&依赖 -Ensures data balance&data balance mechanism&依赖 -data balance mechanism&DataNodes&依赖 -snapshot mechanism&file system&AGGREGATION -Data storage policy&5 storage policy&依赖 -One_SSD – Storage&single replica&AGGREGATION -All_SSD – Storage&replica&AGGREGATION -HDFS NameNode&NameNode&GENERALIZATION -HDFS NameNode&datanode&依赖 -layered storage select&layered data storage&依赖 -layered storage select&proper storage device&依赖 -four type&storage device&AGGREGATION -disk ( mechanical hard disk and ram_disk ( memory virtualization hard disk&ssd ( solid-state disk&依赖 -disk ( mechanical hard disk and ram_disk ( memory virtualization hard disk&ssd ( solid-state disk&依赖 -tag storage select&directory tag&依赖 -tag storage select&directory tag&依赖 -tag storage select&proper DataNode&依赖 -tag storage select&proper DataNode&依赖 -directory tag&data importance level&依赖 -node group storage stores key datum&reliable node group&依赖 -node group storage stores key datum&node group storage stores key datum&依赖 -node group storage stores key 
datum&node group storage stores key datum&依赖 -node group storage stores key datum&reliable node group&依赖 -node group storage stores key datum&node group storage stores key datum&依赖 -node group storage stores key datum&reliable node group&依赖 -node group storage stores key datum&reliable node group&依赖 -node group storage stores key datum&node group storage stores key datum&依赖 -node group storage stores key datum&node group storage stores key datum&依赖 -node group storage stores key datum&node group storage stores key datum&依赖 -node group storage stores key datum&reliable node group&依赖 -DataNode cluster&heterogeneous server&依赖 -node group storage stores key datum&reliable node group&依赖 -Colocation&associated data or datum&依赖 -storage&associated data or datum&AGGREGATION -great consumption&network resource&AGGREGATION -massive migration&datum&AGGREGATION -datum&massive datum and system performance&依赖 -processing speed&massive datum and system performance&AGGREGATION -benefit&colocation Reduces network bandwidth&AGGREGATION -strength&hdf&AGGREGATION -its&fault-tolerance& -its&ability& -distinct difference&fault-tolerance&依赖 -its&throughput& -Relevant resources HDFS Architecture Guide characteristic&hdfs big data huawei peer review contributions by&AGGREGATION -author ruth mare ruth&Kenyatta University&依赖 -She&computer and cloud network&依赖 -She&research and collaboration&依赖 -article&engineering education program&依赖 -Section&Program& -student member&engineering education program&AGGREGATION -article&student member&依赖 -next generation&engineer&AGGREGATION -community-generated pool&resource&AGGREGATION -Section&pool& -Slack community&Careers Legals Resources Blog Case Studies Content Library Solution Briefs Partners Changelog Support Docs Community Slack Help & Support Platform Status Pricing Section&依赖 -our&community& -Slack community&Slack Company&依赖 diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt deleted file mode 100644 index a6ac402123412e806c28ad637e9d49b6d57598eb..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt +++ /dev/null @@ -1,294 +0,0 @@ -select_all Edge AppSpace -SolutionHub -Performance / CDN -Security -Virtual Waiting Room -A/B Testing -Search AppSpace -AppStack -Node.js Edge Hosting -RunStack -Containers -Serverless -gps_fixed Core Platform -Section Control Plane -Edge AppSpace -Adaptive Edge Engine (AEE) -Global Edge Network -Solutions -SaaS -PaaS & Hosting Providers -Edge App Hosting -Docs -Resources -Blog -Case Studies -Edge Content Library -Solution Briefs -Product Videos -Engineering Education -About Section -Partners -Changelog -Pricing -Platform -select_allEdge AppSpace -SolutionHub -Performance / CDN -Security -Virtual Waiting Room -A/B Testing -AppStack -Node.js Edge Hosting -RunStack -Containers -Serverless -Search AppSpace -gps_fixedCore Platform -Section Control Plane -Edge AppSpace -Adaptive Edge Engine (AEE) -Global Edge Network -Docs -Resources -Blog -Case Studies -Content Library -Solution Briefs -Changelog -Engineering Education -Partners -About Section -Pricing -Contact -Log In -Get Started -Key Design of HDFS Architecture -March 31, 2021 -HDFS (Hadoop Distributed File System) is a big data distributed file system storage by Apache. 
It is implemented within the Hadoop framework and needs several design features to be implemented in order to work effectively in processing, distributing, and storing big data.
-HDFS (Hadoop Distributed File System) is similar to other distributed file systems except for some key differences such as fault-tolerance, high throughput, and the ability to be deployed on low-cost hardware.
-Overview
-This article will cover:
-Introduction to Big data framework
-General design of HDFS architecture
-Configuring HDFS data storage policies
-Colocation and its benefits in HDFS
-Introduction to Big data framework
-Big data is data in sets that are of high volume and complexity beyond what traditional data processing software applications can deal with. The big data framework is characterized by the 4Vs, namely:
-Variety (data is of various forms and types)
-Velocity (data processing speed is high)
-Value (low data value density)
-Volume (massive amount of data)
-Apache Hadoop is among the frameworks that can process data with the characteristics described above. Within the Hadoop framework is the Hadoop Distributed File System (HDFS).
-HDFS is a distributed file system of the Hadoop technical framework that was developed based on the Google File System (GFS) and is used to manage files on multiple independent physical servers.
-It is applied in the following scenarios:
-Ecosystem data storage.
-Website user behavior data storage.
-Meteorological data storage.
-General design of HDFS architecture
-HDFS has design features in its architecture that enable it to work efficiently, among which are the following:
-Federation storage: HDFS creates a distinction between the namespace and storage. The two are separated to create a block storage layer.
-High Availability: HDFS supports features such as data replication, which enhances the availability of the system. A single block of data is replicated on 3 nodes, so that even if a single node fails, a client can access the block from 2 other nodes.
-Data can still be accessed normally even when a failure occurs on a ‘DataNode’ or the ‘NameNode’.
-A ‘NameNode’ is a primary component within the HDFS framework that stores meta-data of files, manages and maintains ‘DataNodes’, and assigns them tasks. It is also known as the master node.
-A ‘DataNode’ is a node that stores the actual data within HDFS and performs creation, deletion, and replication of data blocks. It also serves read and write requests for clients and is usually known as the slave node.
-Multiple access modes: Within HDFS, data can be accessed through HTTP on an HTTP browser, the Java API for applications, or command shells.
-Space reclamation: Space that has been released in HDFS can be reclaimed. This is implemented by a recycle bin mechanism where data that had been deleted can be restored from the recycle bin to occupy its initial space. The number of replicas can also be set dynamically.
-NameNode/DataNode in master/slave mode: HDFS consists of NameNodes and DataNodes that work in a master/slave architecture. A single cluster consists of only one NameNode, which regulates data access by clients and manages the namespace within the file system.
-The DataNode receives instructions from the NameNode on when to create, delete, and replicate data blocks.
-Unified file system Namespace: HDFS is presented externally as a coherent file system. Any external process perceives the system as one unified system.
-Data replication: In HDFS, a file’s blocks are replicated for fault tolerance and the number of replicas can be specified by an application. This can be done at creation time but is subject to change at will.
-Metadata persistence: The HDFS NameNode stores the namespace. The NameNode consistently records every change that occurs in file system metadata in a transaction log file called the ‘EditLog’.
-Whenever a new file is created in HDFS, the NameNode inserts a record into the ‘EditLog’ indicating the creation of the new file.
-This information is also synchronized between the active and the standby NameNode periodically.
-Robustness: HDFS stores data reliably even when a failure occurs.
-Its robustness takes into account the 3 common types of failures:
-DataNode failure
-NameNode failure
-Network failure
-Data organization: Data is stored in blocks of size 64MB in HDFS.
-HDFS Data Integrity Assurance: HDFS ensures the completeness of the stored data by implementing reliability processing in case of failure of each component.
-HDFS accomplishes this by doing the following:
-Reconstructing data replicas on invalid data disks - the DataNode periodically reports block messages to the NameNode; if one replica (block) fails, the NameNode will start the procedure to recover lost replicas.
-Ensures data balance among DataNodes - the HDFS architecture is configured with the data balance mechanism, which ensures the even distribution of data among all DataNodes.
-Ensures metadata reliability - the transaction log mechanism is used to operate on metadata, which is stored on both the active and standby NameNodes. The snapshot mechanism of the file system ensures that data can be recovered promptly when a misoperation occurs.
-Provides the security mode - HDFS provides a unique security mode to prevent a fault from spreading when a DataNode or hard disk is faulty.
-Data storage policy: HDFS supports 5 storage policies, namely:
-Hot – Storage on DISK.
-Warm – Storage on both DISK and ARCHIVE.
-Cold – Storage on ARCHIVE.
-One_SSD – Storage of a single replica on SSD and other replicas on DISK.
-All_SSD – Storage of all replicas on SSD.
-Configuring HDFS data storage policies
-The HDFS NameNode automatically selects DataNodes to store data replicas by default. Storage can instead be directed in the following scenarios:
-Layered storage
-Select a proper storage device for layered data storage from multiple devices on a DataNode.
-The HDFS layered storage architecture provides four types of storage devices:
-RAM_DISK (memory virtualization hard disk)
-DISK (mechanical hard disk)
-ARCHIVE (high-density and low-cost storage media)
-SSD (solid-state disk)
-To formulate storage policies for different scenarios, the four types of storage devices are combined.
-Tag storage
-Select a proper DataNode according to directory tags, which indicate data importance levels.
-Node Group Storage
-Stores key data in highly reliable node groups because the DataNode cluster uses heterogeneous servers.
-Colocation and its benefits in HDFS
-Colocation is the storage of associated data, or data that will be associated, on the same storage node.
-It addresses the heavy consumption of network resources caused by massive data migration, which would otherwise degrade the processing speed of massive data and overall system performance.
-Benefits of colocation
-Reduces network bandwidth and resource consumption.
-Enhances easy and quick access to data.
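To make the storage policy list and the replication notes above concrete, the following is a hedged sketch of how a client could request one of those policies through the public HDFS API. It assumes a Hadoop release that actually ships storage policies (2.6 or later exposes DistributedFileSystem.setStoragePolicy); the directory and file paths are invented for illustration.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;

/**
 * Sketch only: applies two of the storage policies listed above to
 * hypothetical directories and tunes replication for one file.
 */
public class StoragePolicyExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs instanceof DistributedFileSystem) {
            DistributedFileSystem dfs = (DistributedFileSystem) fs;
            // Rarely read data goes to ARCHIVE media, frequently read data stays on DISK.
            dfs.setStoragePolicy(new Path("/data/archive"), "COLD");
            dfs.setStoragePolicy(new Path("/data/hot"), "HOT");
        }
        // Replication is a per-file property and can be tuned independently of the policy.
        fs.setReplication(new Path("/data/hot/part-00000"), (short) 3);
        fs.close();
    }
}
```

Changing a policy only affects where new block replicas are placed; moving existing replicas onto the newly selected media is normally done afterwards with the HDFS mover tool.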
-To wrap up
-As mentioned earlier, HDFS is similar to other distributed file systems except for some distinct differences that serve as strengths of HDFS over other distributed file systems.
-These distinct differences are its fault-tolerance, its high throughput, and its ability to be deployed on low-cost hardware and support large data sets.
-Happy learning.
-Relevant resources
-HDFS Architecture Guide
-Characteristics of HDFS
-Big Data Huawei
-Peer Review Contributions by: Srishilesh P S
-About the author - Ruth Mare
-Ruth is an Undergraduate Computer Science student at Kenyatta University. She is passionate about Computer and Cloud networks, Information security, Machine Learning and Artificial Intelligence. She is open to research and collaborations.
-This article was contributed by a student member of Section's Engineering Education Program. Please report any errors or inaccuracies to enged@section.io.
diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt.xml.xls
deleted file mode 100644
index f76135c40e11237c2acb283adfb44aac93ba9584..0000000000000000000000000000000000000000
Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt.xml.xls and /dev/null differ
diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design-relation.txt
deleted file mode 100644
index 54c71d366613fecf47341b3f418f9c7cfbc18921..0000000000000000000000000000000000000000
--- a/src/main/resources/sdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design-relation.txt
+++ /dev/null
@@ -1,527 +0,0 @@
-1
Introduction ....................................................................................................................... 3 2 Assumptions and Goals ..................................................................................................... 3 2.1 Hardware Failure&3 2.2 Streaming Data Access&依赖 -1 Introduction ....................................................................................................................... 3 2 Assumptions and Goals ..................................................................................................... 3 2.1 Hardware Failure&3 2.3 Large Data set&依赖 -Dhruba Borthakur Table&contents&AGGREGATION -persistence&file system metadata ......................................................................... 7 7&AGGREGATION -Heartbeats&re-replication ....................................................... 8 8.2 cluster rebalancing ......................................................................................................&依赖 -........................................................................................................... 8 copyright � 2005&Apache Software Foundation&依赖 -Blocks&Hadoop Distributed File System&依赖 -Blocks&.................................................................................................................. 9 9.2 staging ........................................................................................................................ 10 9.3 pipelining .................................................................................................................... 10 10 accessibility .................................................................................................................. 10 10.1 dfsshell ................................................................................................................... 11 10.2 dfsadmin ................................................................................................................ 11 10.3 browser interface ...................................................................................................... 11 11 space reclamation ........................................................................................................ 11 11.1 file deletes&依赖 -Blocks&..................................................................................................................... 12&依赖 -8.4 metadata disk failure .................................................................................................. 9 8.5 snapshot ...................................................................................................................... 9 9 data organization ............................................................................................................. 9 9.1 datum&8.4 metadata disk failure .................................................................................................. 9 8.5 snapshot ...................................................................................................................... 9 9 data organization ............................................................................................................. 
9 9.1 datum&依赖 -It&many similarity&依赖 -It&distributed file system&依赖 -application&large dataset&依赖 -hdf&few POSIX requirement&依赖 -hdf&infrastructure&依赖 -hdf&open source web crawler Apache Nutch project&依赖 -part&Lucene Apache Project&AGGREGATION -part&Hadoop Project&AGGREGATION -hdf&Hadoop Project&依赖 -Hardware Failure Hardware Failure&exception&依赖 -entire HDFS file system&server machine&依赖 -server machine&file system datum&依赖 -hundreds or thousand&server machine&AGGREGATION -server machine&piece&依赖 -entire HDFS file system&hundreds or thousand&依赖 -piece&file system datum&AGGREGATION -huge number&component&AGGREGATION -non-trivial probability&failure&AGGREGATION -component&failure&依赖 -component&non-trivial probability&依赖 -component&hdf&AGGREGATION -detection&hdf&依赖 -core architectural goal&hdf&AGGREGATION -detection&hdf&依赖 -detection&fault&AGGREGATION -their&data& -hdf&batch processing&依赖 -emphasis&latency&依赖 -latency&data access&AGGREGATION -emphasis&data access&依赖 -throughput&data access&AGGREGATION -emphasis&data access&依赖 -POSIX&many hard requirement&依赖 -Large Data Sets application&large data set&依赖 -hundred&node&AGGREGATION -It&ten&依赖 -It&file&依赖 -It&million&依赖 -It&single cluster&依赖 -million&file&AGGREGATION -ten&million&AGGREGATION -design page 3 copyright � 2005&Apache Software Foundation&依赖 -Most HDFS application&write-once-read-many access model&依赖 -Most HDFS application&file&依赖 -assumption&data coherency issue&实现 -Map-Reduce application&model&依赖 -Map-Reduce application&application&GENERALIZATION -size&data set&AGGREGATION -network congestion and increase overall throughput&system&AGGREGATION -Portability&portable&依赖 -Portability&such a way&依赖 -it&one platform&依赖 -platform&choice&AGGREGATION -widespread adoption&hdf&AGGREGATION -large set&application&AGGREGATION -namenode and datanode hdfs&master/slave architecture&依赖 -master/slave architecture&architecture&GENERALIZATION -HDFS cluster&master server&依赖 -master server&filesystem namespace&依赖 -HDFS cluster&cluster&GENERALIZATION -HDFS cluster&single Namenode&依赖 -number and one&addition&依赖 -number and one&addition&依赖 -number and one&addition&依赖 -number and one&addition&依赖 -number and one&Datanodes&AGGREGATION -cluster&storage&依赖 -hdf&a file system namespace&依赖 -set&Datanodes&AGGREGATION -file&one or more block&依赖 -block&set&依赖 -block&Datanodes&依赖 -etc.&files and directory&AGGREGATION -Namenode&filesystem namespace operation&依赖 -mapping&block&AGGREGATION -It&mapping&依赖 -It&Datanodes&依赖 -Datanodes&block creation&依赖 -block creation&creation&GENERALIZATION -Datanodes&instruction&依赖 -Datanodes&Namenode&依赖 -Namenode and Datanode&software&依赖 -piece&software&AGGREGATION -machine&Java&依赖 -machine&Namenode&依赖 -Usage&portable Java language&AGGREGATION -wide range&machine&AGGREGATION -dedicated machine&machine&GENERALIZATION -Namenode software&software&GENERALIZATION -typical deployment&dedicated machine&依赖 -dedicated machine&Namenode software&依赖 -one instance&Datanode software&AGGREGATION -Datanode software&software&GENERALIZATION -design page 4 copyright � 2005&Apache Software Foundation&依赖 -existence&single Namenode&AGGREGATION -existence&architecture&实现 -existence&system&实现 -architecture&system&AGGREGATION -Namenode&HDFS metada&依赖 -system&flows&依赖 -system&such a way&依赖 -user datum&Namenode&依赖 -File System Namespace hdf&traditional hierarchical file organization&依赖 -user&directory&依赖 -user&directory and store file&依赖 -file system namespace hierarchy&most other existing file system&依赖 -One&file&依赖 -hdf&user quota&实现 -hdf&hard link&依赖 -HDFS 
architecture&feature&实现 -HDFS architecture&architecture&GENERALIZATION -Namenode&file system namespace&依赖 -change&Namenode&依赖 -number&replica&AGGREGATION -application&number&依赖 -replica&file&AGGREGATION -number&file&AGGREGATION -application&file&依赖 -copy&file&AGGREGATION -number©&AGGREGATION -replication factor&file&AGGREGATION -information&Namenode&依赖 -It&file&依赖 -It&sequence&依赖 -sequence&block&AGGREGATION -It&block&依赖 -Blocks&fault tolerance&依赖 -block size and replication factor&file&依赖 -application&file&依赖 -application&replica&依赖 -replication&block&AGGREGATION -Namenode&replication&依赖 -Namenode&block&依赖 -Namenode&decision&依赖 -receipt&heartbeat&AGGREGATION -list&block&AGGREGATION -Blockreport&list&依赖 -Blockreport&Datanode&依赖 -Blockreport&block&依赖 -selection&placement&AGGREGATION -placement&replica&AGGREGATION -feature&most other distributed file system&依赖 -feature&hdf&依赖 -lot&tuning and experience&AGGREGATION -feature&lot&依赖 -feature&tuning and experience&依赖 -purpose&rack-aware replica placement&AGGREGATION -purpose&data reliability&依赖 -design page 5 copyright � 2005&Apache Software Foundation&依赖 -implementation&direction&依赖 -implementation&direction&依赖 -short-term goal&it&依赖 -its&behavior& -hdf&cluster&依赖 -hdf&computer&依赖 -cluster&computer&AGGREGATION -Datanode&rack&依赖 -Datanode&startup time&依赖 -Namenode&rack id&AGGREGATION -rack identity&machine&AGGREGATION -simple but non-optimal policy&replica&依赖 -entire rack&multiple rack&依赖 -entire rack&use&依赖 -use&bandwidth&AGGREGATION -component failure&failure&GENERALIZATION -policy&cluster&依赖 -it&load&依赖 -policy&replica&依赖 -it&component failure&依赖 -write&block&依赖 -policy&cost&依赖 -HDFS.s placement policy&one replica&依赖 -inter-rack write traffic&inter-rack write traffic&依赖 -policy cut&performance&依赖 -chance&rack failure&AGGREGATION -policy&impact datum reliability and availability guarantee&依赖 -it&aggregate network bandwidth&依赖 -datum&three&依赖 -datum&two unique rack&依赖 -replica&rack&依赖 -other one third&replica&AGGREGATION -two third&replica&AGGREGATION -One third&replica&AGGREGATION -other one third&rack&依赖 -policy&performance&依赖 -implementation&above policy&AGGREGATION -Replica Selection hdf&read request&依赖 -Replica Selection hdf&replica&依赖 -HDFS cluster&multiple data center&依赖 -replica&remote replica&依赖 -Namenode&special state&依赖 -Namenode&special state&依赖 -Replication&data block&AGGREGATION -design page 6 copyright � 2005&Apache Software Foundation&依赖 -Namenode&Heartbeat The Hadoop Distributed File System&依赖 -Blockreport&data block&依赖 -Blockreport&Namenode&依赖 -list&data block&AGGREGATION -a datanode report&Namenode&依赖 -a datanode report&Namenode&依赖 -Blockreport&a datanode report&依赖 -block&replica&依赖 -block&specified minimum number&依赖 -specified minimum number&replica&AGGREGATION -data block&block&GENERALIZATION -replica&data block&AGGREGATION -minimum number&replica&AGGREGATION -configurable percentage&safely-replicated data block&AGGREGATION -namenode exit&Safemode state&依赖 -namenode exit&Safemode state&依赖 -namenode exit&Safemode state&依赖 -namenode exit&Safemode state&依赖 -It&list&依赖 -It&data block&依赖 -It&)&依赖 -specified number&replica&AGGREGATION -Namenode&block&依赖 -Namenode&other datanode&依赖 -HDFS namespace&Namenode&依赖 -Persistence&File System Metadata&AGGREGATION -Namenode&transaction log&依赖 -Namenode&EditLog&依赖 -Namenode&system metada&依赖 -Namenode&file&依赖 -Namenode&local file system&依赖 -its&system& -entire file system namespace&FsImage&依赖 -entire file system namespace&file&依赖 -Namenode.s local file system&local file system&GENERALIZATION 
-FsImage&Namenode.s local file system&依赖 -Namenode&memory&依赖 -Namenode&entire file system namespace and file blockmap&依赖 -image&entire file system namespace and file blockmap&AGGREGATION -large number&files and directory&AGGREGATION -Namenode machine&machine&GENERALIZATION -in-memory representation&FsImage&AGGREGATION -it&FsImage and EditLog&依赖 -it&disk&依赖 -It&old EditLog&依赖 -transaction&persistent FsImage&依赖 -its&transactions& -checkpoint¤t implementation&依赖 -Work&periodic checkpointing&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -Datanode&knowledge&依赖 -Datanode&HDFS file&依赖 -It&HDFS datum&依赖 -It&block&依赖 -It&HDFS datum&依赖 -block&HDFS datum&AGGREGATION -It&block&依赖 -It&block&依赖 -It&HDFS datum&依赖 -Datanode&file&依赖 -Datanode&same directory&依赖 -optimal number&file&AGGREGATION -it&heuristic&依赖 -It&subdirectory&依赖 -It&local file&依赖 -local file system&single directory&依赖 -local file system&huge number&依赖 -huge number&file&AGGREGATION -It&same directory&依赖 -list&HDFS data block&AGGREGATION -it&local file system&依赖 -Communication Protocol All communication protocol&TCP/IP protocol&依赖 -top&TCP/IP protocol&AGGREGATION -Communication Protocol All communication protocol&top&依赖 -client&Namenode machine&依赖 -client&connection&依赖 -client&well-defined and configurable port&依赖 -It&Namenode&依赖 -It&ClientProtocol&依赖 -Datanodes&DatanodeProtocol&依赖 -Datanodes&Namenode&依赖 -( rpc ) abstraction&ClientProtocol&依赖 -Namenode&RPC&依赖 -Namenode&design&依赖 -It&RPC request&依赖 -robustness primary objective&hdf&AGGREGATION -robustness primary objective&datum&依赖 -presence&failure&AGGREGATION -three type&common failure&AGGREGATION -Data Disk Failure&Namenode&依赖 -Data Disk Failure&heartbeat message&依赖 -network partition&subset&依赖 -network partition&Datanodes&依赖 -subset&Datanodes&AGGREGATION -lack&heartbeat message&AGGREGATION -namenode mark&datanode&依赖 -namenode mark&datanode&依赖 -datum&hdf&依赖 -replication factor&block&AGGREGATION -their&value& -Namenode&block&依赖 -increase&replication factor&AGGREGATION -HDFS architecture&data rebalancing scheme&依赖 -free space&certain threshold&依赖 -free space&certain threshold&依赖 -sudden high demand&other datum&依赖 -sudden high demand&creation&依赖 -sudden high demand&cluster&依赖 -creation&additional replicas and rebalancing&AGGREGATION -additional replicas and rebalancing&other datum&AGGREGATION -sudden high 
demand&additional replicas and rebalancing&依赖 -sudden high demand&additional replicas and rebalancing&依赖 -sudden high demand&cluster&依赖 -sudden high demand&other datum&依赖 -sudden high demand&creation&依赖 -type&scheme&AGGREGATION -block&datum&AGGREGATION -design page 8 copyright � 2005&Apache Software Foundation&依赖 -HDFS client&HDFS file&实现 -contents&HDFS file&AGGREGATION -HDFS file&file&GENERALIZATION -HDFS client&checksum checking&实现 -HDFS client&contents&实现 -HDFS client&client&GENERALIZATION -it&checksum&依赖 -it&block&依赖 -it&block&依赖 -it&checksum&依赖 -it&checksum&依赖 -it&block&依赖 -it&block&依赖 -checksum&block&AGGREGATION -client&HDFS file&依赖 -it&checksum&依赖 -file contents&contents&GENERALIZATION -it&checksum&依赖 -client&file contents&依赖 -replica&block&AGGREGATION -Datanode&replica&依赖 -Datanode&block&依赖 -central data structure&hdf&AGGREGATION -Metadata Disk Failure The FsImage&hdf&依赖 -corruption&file&AGGREGATION -corruption&entire cluster&依赖 -multiple copy&FsImage and EditLog&AGGREGATION -update&updated synchronously&依赖 -synchronous update&rate&依赖 -rate&namespace transaction&AGGREGATION -synchronous update&second&依赖 -synchronous update&namespace transaction&依赖 -synchronous update&multiple EditLog&AGGREGATION -Namenode&latest consistent FsImage and EditLog to use&依赖 -Namenode machine&HDFS cluster&依赖 -Namenode machine&failure&依赖 -single point&failure&AGGREGATION -automatic restart and failover&Namenode software&AGGREGATION -particular instant&time&AGGREGATION -copy&datum&AGGREGATION -snapshot snapshot©&依赖 -snapshot snapshot&support&依赖 -snapshot snapshot&datum&依赖 -One usage&snapshot-feature&AGGREGATION -One usage&corrupted cluster&依赖 -HDFS current&snapshot&依赖 -they&datum one or more time&依赖 -application&datum&依赖 -hdf&write-once-read-many semantics&依赖 -hdf&file&依赖 -chunk&different datanode&依赖 -HDFS client&file datum&依赖 -HDFS client&temporary local file&依赖 -HDFS client&fact&依赖 -local file&HDFS block size&依赖 -client contact&Namenode&依赖 -client contact&Namenode&依赖 -local file&data worth&依赖 -client contact&Namenode&依赖 -namenode insert&file name&依赖 -namenode insert&file system hierarchy&依赖 -namenode insert&file name&依赖 -namenode insert&file system hierarchy&依赖 -identity&datanode (&AGGREGATION -Namenode&identity&依赖 -Namenode&datanode (&依赖 -Namenode&client request&依赖 -client&datum&依赖 -client&block&依赖 -client&datum&依赖 -client&datum&依赖 -client&block&依赖 -client&block&依赖 -un-flushed datum&Datanode&依赖 -client&Namenode&依赖 -Namenode&persistent store&依赖 -Namenode&point&依赖 -Namenode&file creation operation&依赖 -careful consideration&target application&AGGREGATION -above approach&target application&依赖 -above approach&careful consideration&依赖 -application&streaming write&依赖 -application&file&依赖 -network speed&writes&依赖 -client&client side buffering&依赖 -client&remote file&依赖 -network speed&network impact throughput&依赖 -e.g. AFS&client side caching&依赖 -e.g. 
AFS&earlier distribute file system&依赖 -higher performance&data upload&AGGREGATION -POSIX requirement&data upload&依赖 -POSIX requirement&higher performance&依赖 -client&datum&依赖 -client&HDFS file&依赖 -its&data& -datum&local file&依赖 -HDFS file&replication factor&依赖 -replication factor&three&AGGREGATION -HDFS file&three&依赖 -client&list&依赖 -local file&block&依赖 -local file&user datum&依赖 -list&Datanodes&AGGREGATION -block&user datum&AGGREGATION -client&Namenode&依赖 -Datanodes&replica&依赖 -list&Datanodes&依赖 -Datanodes&block&依赖 -client&first Datanode&依赖 -client&data block&依赖 -its&repository& -first Datanode&datum&依赖 -portion&data block&AGGREGATION -second Datanode&data block&依赖 -second Datanode&portion&依赖 -third Datanode&datum&依赖 -third Datanode&local repository&依赖 -it&next one&依赖 -Datanode&pipeline&依赖 -it&pipeline&依赖 -Datanode&datum&依赖 -it&same time&依赖 -Datanode&previous one&依赖 -datum&one Datanode&依赖 -datum&next&依赖 -Accessibility hdf&application&依赖 -Accessibility hdf&many different way&依赖 -design page 10 copyright � 2005&Apache Software Foundation&依赖 -DFSShell hdf&user datum&依赖 -form&files and directory&AGGREGATION -DFSShell&user interact&依赖 -syntax&command set&AGGREGATION -application&language&依赖 -directory&/ foodir&依赖 -command syntax&application&依赖 -browser interface a typical hdf&web-server&依赖 -hdf namespace and view contents&HDFS file&AGGREGATION -file&user&依赖 -it&hdf&依赖 -hdf&/ trash directory&依赖 -hdf&it&依赖 -hdf&file&依赖 -design page 11 copyright � 2005&Apache Software Foundation&依赖 -file&configurable amount&依赖 -configurable amount&time&AGGREGATION -file&/ trash&依赖 -file&time&依赖 -expiry&life&AGGREGATION -Namenode&/ trash&依赖 -Namenode&file&依赖 -Namenode&HDFS namespace&依赖 -Namenode&file&依赖 -its&life& -deletion&block&依赖 -deletion&file&AGGREGATION -time&corresponding increase&AGGREGATION -user&file&依赖 -it&/ trash directory&依赖 -user&file&依赖 -he/she&that&依赖 -he/she&/ trash directory&依赖 -/ trash directory&file&依赖 -/ trash directory&latest copy&依赖 -latest copy&file&AGGREGATION -hdf&directory&依赖 -/ trash directory&one special feature&依赖 -hdf&policy&依赖 -hdf&file&依赖 -current default policy&file&依赖 -policy&future&依赖 -policy&defined interface&依赖 -Namenode&excess replica&依赖 -next heartbeat transfer&information&依赖 -corresponding free space&cluster&依赖 -Datanode&corresponding block&依赖 -completion&setReplication apus&AGGREGATION -appearance&free space&AGGREGATION -hdf source code&Hadoop Distributed File System&依赖 -design page 12 copyright � 2005&Apache Software Foundation&依赖 diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt deleted file mode 100644 index 5cbf2094b3a61046925de9fe63866eea634391a5..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt +++ /dev/null @@ -1,380 +0,0 @@ -The Hadoop Distributed File System: -Architecture and Design -by Dhruba Borthakur -Table of contents -1 Introduction .......................................................................................................................3 -2 Assumptions and Goals .....................................................................................................3 -2.1 Hardware Failure........................................................................................................... 
3 -2.2 Streaming Data Access .................................................................................................3 -2.3 Large Data Sets .............................................................................................................3 -2.4 Simple Coherency Model ............................................................................................. 3 -2.5 Moving computation is cheaper than moving data .......................................................4 -2.6 Portability across Heterogeneous Hardware and Software Platforms ..........................4 -3 Namenode and Datanode .................................................................................................. 4 -4 The File System Namespace .............................................................................................5 -5 Data Replication ................................................................................................................5 -5.1 Replica Placement . The First Baby Steps ....................................................................5 -5.2 Replica Selection .......................................................................................................... 6 -5.3 SafeMode ......................................................................................................................6 -6 The Persistence of File System Metadata ......................................................................... 7 -7 The Communication Protocol ........................................................................................... 8 -8 Robustness ........................................................................................................................ 8 -8.1 Data Disk Failure, Heartbeats and Re-Replication .......................................................8 -8.2 Cluster Rebalancing ......................................................................................................8 -8.3 Data Correctness ...........................................................................................................8 -Copyright © 2005 The Apache Software Foundation. All rights reserved. -8.4 Metadata Disk Failure .................................................................................................. 9 -8.5 Snapshots ......................................................................................................................9 -9 Data Organization ............................................................................................................. 9 -9.1 Data Blocks .................................................................................................................. 9 -9.2 Staging ........................................................................................................................10 -9.3 Pipelining ....................................................................................................................10 -10 Accessibility .................................................................................................................. 
10 -10.1 DFSShell ...................................................................................................................11 -10.2 DFSAdmin ................................................................................................................11 -10.3 Browser Interface ......................................................................................................11 -11 Space Reclamation ........................................................................................................ 11 -11.1 File Deletes and Undelete ......................................................................................... 11 -11.2 Decrease Replication Factor ..................................................................................... 12 -12 References ..................................................................................................................... 12 -The Hadoop Distributed File System: Architecture and Design -Page 2 -Copyright © 2005 The Apache Software Foundation. All rights reserved. -1. Introduction -The Hadoop File System (HDFS) is as a distributed file system running on commodity -hardware. It has many similarities with existing distributed file systems. However, the -differences from other distributed file systems are significant. HDFS is highly fault-tolerant -and can be deployed on low-cost hardware. HDFS provides high throughput access to -application data and is suitable for applications that have large datasets. HDFS relaxes a few -POSIX requirements to enable streaming access to file system data. HDFS was originally -built as infrastructure for the open source web crawler Apache Nutch project. HDFS is part -of the Hadoop Project, which is part of the Lucene Apache Project. The Project URL is here. -2. Assumptions and Goals -2.1. Hardware Failure -Hardware Failure is the norm rather than the exception. The entire HDFS file system may -consist of hundreds or thousands of server machines that stores pieces of file system data. -The fact that there are a huge number of components and that each component has a -non-trivial probability of failure means that some component of HDFS is always -non-functional. Therefore, detection of faults and automatically recovering quickly from -those faults are core architectural goals of HDFS. -2.2. Streaming Data Access -Applications that run on HDFS need streaming access to their data sets. They are not general -purpose applications that typically run on a general purpose file system. HDFS is designed -more for batch processing rather than interactive use by users. The emphasis is on throughput -of data access rather than latency of data access. POSIX imposes many hard requirements -that are not needed for applications that are targeted for HDFS. POSIX semantics in a few -key areas have been traded off to further enhance data throughout rates. -2.3. Large Data Sets -Applications that run on HDFS have large data sets. This means that a typical file in HDFS is -gigabytes to terabytes in size. Thus, HDFS is tuned to support large files. It should provide -high aggregate data bandwidth and should scale to hundreds of nodes in a single cluster. It -should support tens of millions of files in a single cluster. -2.4. Simple Coherency Model -The Hadoop Distributed File System: Architecture and Design -Page 3 -Copyright © 2005 The Apache Software Foundation. All rights reserved. -Most HDFS applications need write-once-read-many access model for files. 
-A file once created, written and closed need not be changed. This assumption simplifies data
-coherency issues and enables high throughput data access. A Map-Reduce application or a
-Web-Crawler application fits perfectly with this model. There is a plan to support
-appending-writes to a file in the future.
-2.5. Moving computation is cheaper than moving data
-A computation requested by an application is most efficient if the computation can be done
-near where the data is located. This is especially true when the size of the data set is huge.
-This eliminates network congestion and increases the overall throughput of the system. The
-assumption is that it is often better to migrate the computation closer to where the data is
-located rather than moving the data to where the application is running. HDFS provides
-interfaces for applications to move themselves closer to where the data is located.
-2.6. Portability across Heterogeneous Hardware and Software Platforms
-HDFS should be designed in such a way that it is easily portable from one platform to
-another. This facilitates widespread adoption of HDFS as a platform of choice for a large set
-of applications.
-3. Namenode and Datanode
-HDFS has a master/slave architecture. An HDFS cluster consists of a single Namenode, a
-master server that manages the filesystem namespace and regulates access to files by clients.
-In addition, there are a number of Datanodes, one per node in the cluster, which manage
-storage attached to the nodes that they run on. HDFS exposes a file system namespace and
-allows user data to be stored in files. Internally, a file is split into one or more blocks and
-these blocks are stored in a set of Datanodes. The Namenode executes filesystem namespace
-operations such as opening, closing and renaming files and directories. It also determines the
-mapping of blocks to Datanodes. The Datanodes are responsible for serving read and write
-requests from filesystem clients. The Datanodes also perform block creation, deletion, and
-replication upon instruction from the Namenode.
-The Namenode and Datanode are pieces of software that run on commodity machines. These
-machines are typically commodity Linux machines. HDFS is built using the Java language;
-any machine that supports Java can run the Namenode or the Datanode. Usage of the highly
-portable Java language means that HDFS can be deployed on a wide range of machines. A
-typical deployment could have a dedicated machine that runs only the Namenode software.
-Each of the other machines in the cluster runs one instance of the Datanode software. The
-architecture does not preclude running multiple Datanodes on the same machine, but in a
-real deployment that is rarely the case.
-The existence of a single Namenode in a cluster greatly simplifies the architecture of the
-system. The Namenode is the arbitrator and repository for all HDFS metadata. The system is
-designed in such a way that user data never flows through the Namenode.
-4. The File System Namespace
-HDFS supports a traditional hierarchical file organization. A user or an application can create
-directories and store files inside these directories. The file system namespace hierarchy is
-similar to most other existing file systems. One can create and remove files, move a file from
-one directory to another, or rename a file.
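These namespace operations are exposed programmatically through Hadoop's org.apache.hadoop.fs.FileSystem class. The sketch below is illustrative only: the cluster address, the fs.defaultFS key and the paths are placeholders, not configuration guidance for a specific Hadoop release.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class NamespaceOpsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder Namenode address; adjust to the target cluster.
        conf.set("fs.defaultFS", "hdfs://namenode:9000");
        try (FileSystem fs = FileSystem.get(conf)) {
            // Create a directory, analogous to "hadoop dfs -mkdir /foodir".
            fs.mkdirs(new Path("/foodir"));
            // Move a file from one directory to another (a namespace-only operation).
            fs.rename(new Path("/foodir/old.txt"), new Path("/archive/old.txt"));
            // Remove a file; 'false' means do not delete recursively.
            fs.delete(new Path("/foodir/tmp.txt"), false);
        }
    }
}
```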
-HDFS does not yet implement user quotas or access permissions. HDFS does not support
-hard links or soft links. However, the HDFS architecture does not preclude implementing
-these features at a later time.
-The Namenode maintains the file system namespace. Any change to the file system
-namespace or its properties is recorded by the Namenode. An application can specify the
-number of replicas of a file that should be maintained by HDFS. The number of copies of a
-file is called the replication factor of that file. This information is stored by the Namenode.
-5. Data Replication
-HDFS is designed to reliably store very large files across machines in a large cluster. It stores
-each file as a sequence of blocks; all blocks in a file except the last block are the same size.
-Blocks belonging to a file are replicated for fault tolerance. The block size and replication
-factor are configurable per file. Files in HDFS are write-once and have strictly one writer at
-any time. An application can specify the number of replicas of a file. The replication factor
-can be specified at file creation time and can be changed later.
-The Namenode makes all decisions regarding replication of blocks. It periodically receives a
-Heartbeat and a Blockreport from each of the Datanodes in the cluster. Receipt of a
-Heartbeat implies that the Datanode is in good health and is serving data as desired. A
-Blockreport contains a list of all blocks on that Datanode.
-5.1. Replica Placement: The First Baby Steps
-The placement of replicas is critical to HDFS reliability and performance. This
-feature distinguishes HDFS from most other distributed file systems. It is a feature that
-needs lots of tuning and experience. The purpose of rack-aware replica placement is to
-improve data reliability, availability, and network bandwidth utilization. The current
-implementation of the replica placement policy is a first effort in this direction. The
-short-term goals of implementing this policy are to validate it on production systems, learn
-more about its behavior and build a foundation to test and research more sophisticated
-policies in the future.
-HDFS runs on a cluster of computers that are spread across many racks. Communication
-between two nodes on different racks has to go through switches. In most cases, network
-bandwidth between two machines in the same rack is greater than network bandwidth
-between two machines on different racks.
-At startup time, each Datanode determines the rack it belongs to and notifies the Namenode
-of the rack id upon registration. HDFS provides APIs to facilitate pluggable modules that can
-be used to determine the rack identity of a machine. A simple but non-optimal policy is to
-place replicas across racks. This prevents losing data when an entire rack fails and allows use
-of bandwidth from multiple racks when reading data. This policy evenly distributes replicas
-in the cluster and thus makes it easy to balance load on component failure. However, this
-policy increases the cost of writes because a write needs to transfer blocks to multiple racks.
-For the most common case, when the replication factor is three, HDFS's placement policy is
-to place one replica on the local node, another replica on a different node in the local rack,
-and the last replica on a different node in a different rack.
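The three-replica rule just stated can be summarized in a few lines. The sketch below is purely illustrative and is not HDFS's actual block placement implementation; the Node record (Java 16+ syntax) and the cluster list are simplified stand-ins.

```java
import java.util.ArrayList;
import java.util.List;

public class ReplicaPlacementSketch {
    record Node(String name, String rack) {}

    /** Pick targets for replication factor 3: the writer's node, another node on the
     *  same rack, and one node on a different rack (best effort over the given list). */
    static List<Node> chooseTargets(Node writer, List<Node> cluster) {
        List<Node> targets = new ArrayList<>();
        targets.add(writer); // first replica on the local node
        for (Node n : cluster) {
            if (targets.size() == 1 && n.rack().equals(writer.rack()) && !n.equals(writer)) {
                targets.add(n); // second replica on a different node of the local rack
            } else if (targets.size() == 2 && !n.rack().equals(writer.rack())) {
                targets.add(n); // third replica on a node in a different rack
            }
            if (targets.size() == 3) break;
        }
        return targets;
    }

    public static void main(String[] args) {
        List<Node> cluster = List.of(new Node("dn1", "rackA"),
                new Node("dn2", "rackA"), new Node("dn3", "rackB"));
        System.out.println(chooseTargets(cluster.get(0), cluster));
    }
}
```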
-This policy cuts the inter-rack write traffic and improves write performance. The chance of
-rack failure is far less than that of node failure, so this policy does not impact data reliability
-and availability guarantees. But it reduces the aggregate network bandwidth used when
-reading data, since a block is placed in only two unique racks rather than three. The replicas
-of a file are not evenly distributed across the racks: one third of the replicas are on one node,
-two thirds of the replicas are on one rack, and the other third are evenly distributed across
-the remaining racks. This policy improves write performance without impacting data
-reliability or read performance.
-The implementation of the above policy is work in progress.
-5.2. Replica Selection
-HDFS tries to satisfy a read request from a replica that is closest to the reader. If there exists
-a replica on the same rack as the reader node, then that replica is preferred to satisfy the read
-request. If an HDFS cluster spans multiple data centers, then a replica that is resident in the
-local data center is preferred over remote replicas.
-5.3. SafeMode
-On startup, the Namenode enters a special state called Safemode. Replication of data blocks
-does not occur when the Namenode is in the Safemode state. The Namenode receives Heartbeat
-and Blockreport messages from the Datanodes. A Blockreport contains the list of data blocks
-that a Datanode reports to the Namenode. Each block has a specified minimum number of
-replicas. A block is considered safely replicated when the minimum number of replicas of that
-data block has checked in with the Namenode. When a configurable percentage of
-safely replicated data blocks has checked in with the Namenode (plus an additional 30 seconds),
-the Namenode exits the Safemode state. It then determines the list of data blocks (if any) that
-have fewer than the specified number of replicas. The Namenode then replicates these blocks
-to other Datanodes.
-6. The Persistence of File System Metadata
-The HDFS namespace is stored by the Namenode. The Namenode uses a transaction log
-called the EditLog to persistently record every change that occurs to file system metadata.
-For example, creating a new file in HDFS causes the Namenode to insert a record into the
-EditLog indicating this change. Similarly, changing the replication factor of a file causes a
-new record to be inserted into the EditLog. The Namenode uses a file in its local file system
-to store the EditLog. The entire file system namespace, including the mapping of blocks to
-files and file system properties, is stored in a file called the FsImage. The FsImage is a file in
-the Namenode's local file system too.
-The Namenode keeps an image of the entire file system namespace and file Blockmap in
-memory. This metadata is designed to be compact, so that 4GB of memory on the Namenode
-machine is plenty to support a very large number of files and directories. When the
-Namenode starts up, it reads the FsImage and EditLog from disk, applies all the transactions
-from the EditLog to the in-memory representation of the FsImage and then flushes out this
-new metadata into a new FsImage on disk. It can then truncate the old EditLog because its
-transactions have been applied to the persistent FsImage. This process is called a checkpoint.
-In the current implementation, a checkpoint occurs when the Namenode starts up.
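The checkpoint procedure described above amounts to "load image, replay log, write new image, truncate log". The following sketch illustrates that cycle with a toy record format; the file names and the tab-separated transaction layout are invented for illustration and bear no relation to the real FsImage/EditLog encodings.

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CheckpointSketch {
    // In-memory stand-in for the namespace image: path -> replication factor.
    private final Map<String, Integer> namespace = new HashMap<>();

    void checkpoint(Path fsImage, Path editLog) throws IOException {
        // 1. Load the last persisted image, if any.
        if (Files.exists(fsImage)) {
            for (String line : Files.readAllLines(fsImage, StandardCharsets.UTF_8)) {
                String[] parts = line.split("\t");
                namespace.put(parts[0], Integer.parseInt(parts[1]));
            }
        }
        // 2. Replay every logged transaction on top of the image.
        if (Files.exists(editLog)) {
            for (String line : Files.readAllLines(editLog, StandardCharsets.UTF_8)) {
                String[] parts = line.split("\t"); // e.g. "CREATE\t/a/b\t3" or "DELETE\t/a/b"
                if (parts[0].equals("CREATE")) namespace.put(parts[1], Integer.parseInt(parts[2]));
                else if (parts[0].equals("DELETE")) namespace.remove(parts[1]);
            }
        }
        // 3. Flush the merged state into a new image, then truncate the old log:
        //    its transactions are now reflected in the persistent image.
        List<String> lines = new ArrayList<>();
        namespace.forEach((path, rep) -> lines.add(path + "\t" + rep));
        Files.write(fsImage, lines, StandardCharsets.UTF_8);
        Files.write(editLog, new byte[0]);
    }
}
```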
-Work is in progress to support periodic checkpointing in the near future.
-The Datanode stores HDFS data in files in its local file system. The Datanode has no
-knowledge about HDFS files. It stores each block of HDFS data in a separate file in its local
-file system. The Datanode does not create all files in the same directory. Instead, it uses a
-heuristic to determine the optimal number of files per directory and creates subdirectories
-appropriately. It is not optimal to create all local files in the same directory because the local
-file system might not be able to efficiently support a huge number of files in a single
-directory. When a Datanode starts up, it scans through its local file system, generates a list of
-all HDFS data blocks that correspond to each of these local files and sends this report to the
-Namenode. This report is called the Blockreport.
-7. The Communication Protocol
-All communication protocols are layered on top of the TCP/IP protocol. A client establishes
-a connection to a well-defined and configurable port on the Namenode machine. It talks the
-ClientProtocol with the Namenode. The Datanodes talk to the Namenode using the
-DatanodeProtocol. The details of these protocols are explained later. A Remote
-Procedure Call (RPC) abstraction wraps both the ClientProtocol and the DatanodeProtocol.
-By design, the Namenode never initiates an RPC; it only responds to RPC requests issued by
-Datanodes or clients.
-8. Robustness
-The primary objective of HDFS is to store data reliably even in the presence of failures. The
-three common types of failures are Namenode failures, Datanode failures and network
-partitions.
-8.1. Data Disk Failure, Heartbeats and Re-Replication
-Each Datanode sends a heartbeat message to the Namenode periodically. A network partition
-can cause a subset of Datanodes to lose connectivity with the Namenode. The Namenode
-detects this condition by the absence of heartbeat messages. The Namenode marks these
-Datanodes as dead and does not forward any new IO requests to them. The data that was
-residing on those Datanodes is no longer available to HDFS. This may cause the
-replication factor of some blocks to fall below their specified value. The Namenode
-determines all the blocks that need to be replicated and starts replicating them to other
-Datanodes. The necessity for re-replication may arise for many reasons: a Datanode
-becoming unavailable, a corrupt replica, a bad disk on a Datanode or an increase in the
-replication factor of a file.
-8.2. Cluster Rebalancing
-The HDFS architecture is compatible with data rebalancing schemes. It is possible that data
-may move automatically from one Datanode to another if the free space on a Datanode falls
-below a certain threshold. Also, a sudden high demand for a particular file can dynamically
-cause creation of additional replicas and rebalancing of other data in the cluster. These types
-of rebalancing schemes are not yet implemented.
-8.3. Data Correctness
-It is possible that a block of data fetched from a Datanode is corrupted. This corruption can
-occur because of faults in the storage device, a bad network or buggy software. The HDFS
-client implements checksum checking on the contents of an HDFS file.
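A minimal illustration of this client-side checksum checking, whose details the next paragraphs spell out: compute a checksum per block on write, recompute and compare on read. The real HDFS implementation uses its own per-chunk checksum algorithm and hidden-file format; the CRC-32 variant below is only an approximation for illustration.

```java
import java.util.zip.CRC32;

public class BlockChecksumSketch {
    /** Compute a CRC-32 checksum over one block's bytes. */
    public static long checksumOfBlock(byte[] blockData) {
        CRC32 crc = new CRC32();
        crc.update(blockData, 0, blockData.length);
        return crc.getValue();
    }

    /** On read, the client recomputes the checksum and compares it with the value stored
     *  in the checksum file; a mismatch would trigger a retry against another replica. */
    public static boolean verify(byte[] receivedBlock, long storedChecksum) {
        return checksumOfBlock(receivedBlock) == storedChecksum;
    }
}
```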
-When a client creates an HDFS file, it computes a checksum of each block of the file and stores
-these checksums in a separate hidden file in the same HDFS namespace. When a client retrieves
-file contents, it verifies that the data it received from a Datanode matches the checksum stored
-in the checksum file. If not, the client can opt to retrieve that block from another Datanode
-that has a replica of that block.
-8.4. Metadata Disk Failure
-The FsImage and the EditLog are central data structures of HDFS. A corruption of these files
-can cause the entire cluster to become non-functional. For this reason, the Namenode can be
-configured to support multiple copies of the FsImage and EditLog. Any update to either the
-FsImage or EditLog causes each of the FsImages and EditLogs to be updated synchronously.
-This synchronous updating of multiple EditLogs may degrade the rate of namespace
-transactions per second that a Namenode can support. But this degradation is acceptable
-because HDFS applications are very data intensive in nature; they are not metadata intensive.
-A Namenode, when it restarts, selects the latest consistent FsImage and EditLog to use.
-The Namenode machine is a single point of failure for the HDFS cluster. If a Namenode
-machine fails, manual intervention is necessary. Currently, automatic restart and failover of
-the Namenode software to another machine is not supported.
-8.5. Snapshots
-Snapshots support storing a copy of data at a particular instant of time. One usage of the
-snapshot feature may be to roll back a corrupted cluster to a previously known good point in
-time. HDFS does not currently support snapshots, but they will be supported in a future release.
-9. Data Organization
-9.1. Data Blocks
-HDFS is designed to support large files. Applications that are compatible with HDFS are
-those that deal with large data sets. These applications write the data only once; they read the
-data one or more times and require that reads are satisfied at streaming speeds. HDFS
-supports write-once-read-many semantics on files. A typical block size used by HDFS is 64
-MB. Thus, an HDFS file is chopped up into 64 MB chunks, and each chunk may reside on a
-different Datanode.
-9.2. Staging
-A client request to create a file does not reach the Namenode immediately. In fact, the HDFS
-client caches the file data into a temporary local file. Application writes are transparently
-redirected to this temporary local file. When the local file accumulates data worth over one
-HDFS block size, the client contacts the Namenode. The Namenode inserts the file name into
-the file system hierarchy and allocates a data block for it. The Namenode responds to the
-client request with the identity of the Datanode(s) and the destination data block. The client
-flushes the block of data from the local temporary file to the specified Datanode. When a file
-is closed, the remaining un-flushed data in the temporary local file is transferred to the
-Datanode. The client then informs the Namenode that the file is closed. At this point, the
-Namenode commits the file creation operation into a persistent store. If the Namenode dies
-before the file is closed, the file is lost.
-The above approach has been adopted after careful consideration of the target applications
-that run on HDFS. These applications need streaming writes to files.
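The staging behaviour just described, buffer application writes locally and involve the Namenode only once a full block has accumulated, can be sketched as follows. The 64 MB constant matches the block size mentioned above, while allocateBlockFromNamenode and flushToDatanode are hypothetical placeholders for the RPC and data-transfer steps, not real HDFS client calls.

```java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class StagingWriteSketch {
    private static final long BLOCK_SIZE = 64L * 1024 * 1024; // 64 MB, as in the text
    private final Path tempFile = Files.createTempFile("hdfs-staging-", ".tmp");

    public StagingWriteSketch() throws IOException {}

    /** Application writes are redirected to the local temporary file. */
    public void write(byte[] data) throws IOException {
        Files.write(tempFile, data, StandardOpenOption.APPEND);
        if (Files.size(tempFile) >= BLOCK_SIZE) {
            // Only now does the client involve the Namenode and ship the block.
            String blockTarget = allocateBlockFromNamenode(); // hypothetical RPC
            flushToDatanode(blockTarget, tempFile);            // hypothetical data transfer
            Files.write(tempFile, new byte[0]);                // truncate: start staging the next block
        }
    }

    private String allocateBlockFromNamenode() { return "datanode-1:blk_0001"; } // placeholder
    private void flushToDatanode(String target, Path staged) { /* placeholder */ }
}
```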
-If a client writes to a remote file directly without any client-side buffering, the network speed
-and the congestion in the network impact throughput considerably. This approach is not
-without precedent either. Earlier distributed file systems, e.g. AFS, have used client-side
-caching to improve performance. A POSIX requirement has been relaxed to achieve higher
-performance of data uploads.
-9.3. Pipelining
-When a client is writing data to an HDFS file, its data is first written to a local file as
-explained above. Suppose the HDFS file has a replication factor of three. When the local file
-accumulates a block of user data, the client retrieves a list of Datanodes from the Namenode.
-This list contains the Datanodes that will host a replica of that block. The client then
-flushes the data block to the first Datanode. The first Datanode starts receiving the data in
-small portions (4 KB), writes each portion to its local repository and transfers that portion to
-the second Datanode in the list. The second Datanode, in turn, starts receiving each portion of
-the data block, writes that portion to its repository and then flushes that portion to the third
-Datanode. The third Datanode writes the data to its local repository. A Datanode can thus be
-receiving data from the previous one in the pipeline and at the same time forwarding data to
-the next one in the pipeline. In this way, the data is pipelined from one Datanode to the next.
-10. Accessibility
-HDFS can be accessed by applications in many different ways. Natively, HDFS provides a
-Java API for applications to use. A C language wrapper for this Java API is also available. An
-HTTP browser can also be used to browse the files of an HDFS instance. Work is in progress
-to expose an HDFS content repository through the WebDAV protocol.
-10.1. DFSShell
-HDFS allows user data to be organized in the form of files and directories. It provides an
-interface called DFSShell that lets a user interact with the data in HDFS. The syntax of this
-command set is similar to other shells (e.g. bash, csh) that users are already familiar with.
-Here are some sample commands:
-Create a directory named /foodir : hadoop dfs -mkdir /foodir
-View a file /foodir/myfile.txt : hadoop dfs -cat /foodir/myfile.txt
-Delete a file /foodir/myfile.txt : hadoop dfs -rm /foodir/myfile.txt
-The command syntax for DFSShell is targeted at applications that need a scripting language
-to interact with the stored data.
-10.2. DFSAdmin
-The DFSAdmin command set is used for administering an HDFS cluster. These are commands
-that are used only by an HDFS administrator. Here are some sample commands:
-Put the cluster in Safemode : bin/hadoop dfsadmin -safemode enter
-Generate a list of Datanodes : bin/hadoop dfsadmin -report
-Decommission a Datanode : bin/hadoop dfsadmin -decommission datanodename
-10.3. Browser Interface
-A typical HDFS install configures a web server to expose the HDFS namespace through a
-configurable port. This allows a Web browser to navigate the HDFS namespace and view the
-contents of HDFS files.
-11. Space Reclamation
-11.1. File Deletes and Undelete
-When a file is deleted by a user or an application, it is not immediately removed from HDFS.
-Instead, HDFS renames it to a file in the /trash directory.
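Looking back at section 9.3, the per-Datanode behaviour in the write pipeline is essentially "receive a small portion, persist it, forward it". A minimal sketch, with plain Java streams standing in for the network connections and the local block file:

```java
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

public class PipelineForwarderSketch {
    /** Receive a block in 4 KB portions; persist each portion locally and forward it to the
     *  next Datanode in the pipeline (if any) before reading the next portion. */
    public static void relay(InputStream fromUpstream, OutputStream localRepository,
                             OutputStream nextDatanode) throws IOException {
        byte[] portion = new byte[4 * 1024];
        int read;
        while ((read = fromUpstream.read(portion)) != -1) {
            localRepository.write(portion, 0, read);
            if (nextDatanode != null) {
                nextDatanode.write(portion, 0, read);
            }
        }
        localRepository.flush();
        if (nextDatanode != null) nextDatanode.flush();
    }
}
```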
-The file can be restored quickly as long as it remains in /trash. A file remains in /trash for a
-configurable amount of time. After the expiry of its life in /trash, the Namenode deletes the
-file from the HDFS namespace. The deletion of the file causes the blocks associated with the
-file to be freed. Note that there could be an appreciable time delay between the time a file is
-deleted by a user and the time of the corresponding increase in free space in HDFS.
-A user can undelete a file after deleting it as long as it remains in the /trash directory. If a
-user wants to undelete a file that he/she has deleted, he/she can navigate the /trash directory
-and retrieve the file. The /trash directory contains only the latest copy of the file that was
-deleted. The /trash directory is just like any other directory, with one special feature: HDFS
-applies specified policies to automatically delete files from this directory. The current default
-policy is to delete files that are older than 6 hours. In the future, this policy will be
-configurable through a well-defined interface.
-11.2. Decrease Replication Factor
-When the replication factor of a file is reduced, the Namenode selects excess replicas that can
-be deleted. The next Heartbeat transfers this information to the Datanode. The Datanode then
-removes the corresponding blocks and the corresponding free space appears in the cluster.
-The point to note here is that there might be a time delay between the completion of the
-setReplication API call and the appearance of free space in the cluster.
-12. References
-Browse the HDFS Java Interface
-Download the HDFS source code
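The setReplication API referred to in section 11.2 is available on the FileSystem class of the Java API mentioned in section 10. A minimal sketch follows; the cluster address and file path are placeholders, and, as the text notes, the freed space only appears after the Datanodes act on a later Heartbeat.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DecreaseReplicationExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://namenode:9000"); // placeholder address
        try (FileSystem fs = FileSystem.get(conf)) {
            Path file = new Path("/foodir/myfile.txt");
            // Lower the replication factor from the default (3) to 2; the Namenode will
            // schedule deletion of the excess replicas via later Heartbeats.
            boolean accepted = fs.setReplication(file, (short) 2);
            System.out.println("setReplication accepted: " + accepted);
        }
    }
}
```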
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt.xml.xls deleted file mode 100644 index 9ca46f8638cdce70e740b411bfe7493707d7fadc..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture-relation.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture-relation.txt deleted file mode 100644 index 4bc48d661c1f2e305c392a5574aa7ba8f56ceade..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture-relation.txt +++ /dev/null @@ -1,635 +0,0 @@ -Scalable HDFS Architecture Farag Azzedin Information&corporation&依赖 -Computer Science Department King Fahd University&petroleum and minerals dhahran&AGGREGATION -Scalable HDFS Architecture Farag Azzedin Information&cost&依赖 -One&area&AGGREGATION -Apache Hadoop&large scale data processing project&依赖 -Apache Hadoop&Hadoop&GENERALIZATION -one&data-intensive distributed application&依赖 -one&large scale data processing project&AGGREGATION -Hadoop application&distributed file system&依赖 -Hadoop application&data storage&依赖 -data storage&storage&GENERALIZATION -Hadoop application&file system ( hdf )&依赖 -master node&node&GENERALIZATION -its&RAM& -HDFS architecture&architecture&GENERALIZATION -metada&storage node&AGGREGATION -HDFS architecture&single master node&依赖 -HDFS architecture&single master node&依赖 -NameNode&Datanodes&依赖 -HDFS&NameNode& -HDFS Datanodes ’ metada&’s single-point-of-failure namenode&依赖 -capacity&RAM&AGGREGATION -HDFS Datanodes ’ metada&RAM&依赖 -HDFS Datanodes ’ metada&capacity&依赖 -RAM&’s single-point-of-failure namenode&AGGREGATION -paper&fault tolerant , highly available and widely scalable HDFS architecture&依赖 -drawback¤t HDFS architecture&AGGREGATION -this motivated researcher&this motivated researcher&依赖 -this motivated researcher&mapreduce&依赖 -this motivated researcher&system&依赖 -Google&MapReduce& -Apache Hadoop&cloud computing project&依赖 -reliable and scalable datum intensive distribute computing [ 2 , 3 , 4 ]&aiming&依赖 -one&cloud computing project&AGGREGATION -reliable and scalable datum intensive distribute computing [ 2 , 3 , 4 ]&Java&依赖 -its&applications& -HDFS implementation&even thousand&依赖 -HDFS implementation&server machine&依赖 -even thousand&server machine&AGGREGATION -system&data& -part&’s datum&AGGREGATION -HDFS implementation&implementation&GENERALIZATION -thus high probability&hardware failure&AGGREGATION -more server machine&more hardware&依赖 -component&hdf&AGGREGATION -faults detection&hdf [ 3 , 7 ]&依赖 -fundamental architectural goal&hdf [ 3 , 7 ]&AGGREGATION -same&HDFS cluster&依赖 -same&NameNode server&依赖 -HDFS cluster&cluster&GENERALIZATION -HDFS&performance& -availability&single NameNode machine&AGGREGATION -automatic restart and failover&NameNode software&AGGREGATION -Hadoop applications utilize HDFS [ 7 ]&Datanodes&依赖 -Hadoop applications utilize HDFS [ 7 ]&RAM [ 7 ]&依赖 -Hadoop applications utilize HDFS [ 7 ]&single master node&依赖 -Hadoop applications utilize HDFS [ 7 ]&NameNode&依赖 -its&]& -paper&NameNode&依赖 -case&single NameNode failure&AGGREGATION -HDFS NameNode&NameNode&GENERALIZATION 
-Several research project&Chord&依赖 -Several research project&basis&依赖 -Several research project&research&依赖 -their&research& -chord file system ( cfs ) store file&peer-to-peer system&依赖 -chord file system ( cfs ) store file&chord file system ( cfs ) store file&依赖 -chord file system ( cfs ) store file&peer-to-peer system&依赖 -chord file system ( cfs ) store file&peer-to-peer system&依赖 -chord file system ( cfs ) store file&peer-to-peer system&依赖 -chord file system ( cfs ) store file&chord file system ( cfs ) store file&依赖 -chord file system ( cfs ) store file&chord file system ( cfs ) store file&依赖 -chord file system ( cfs ) store file&chord file system ( cfs ) store file&依赖 -Chord&algorithms& -value&[ 9 ]&依赖 -set&special root server&AGGREGATION -ordinary dn&set&依赖 -Chord-based dn&special server&依赖 -ordinary dn&special root server&依赖 -route information ( n record&name server hierarchy&依赖 -correctness&analogous route information [ 9 ]&AGGREGATION -dn&route information ( n record&依赖 -Chord&analogous route information [ 9 ]&依赖 -dn&manual management&依赖 -route information ( n record&client&依赖 -Chord&correctness&依赖 -manual management&route information ( n record&AGGREGATION -Chord&no name structure [ 9 ]&依赖 -dn&named hosts or service&依赖 -dn&task&依赖 -rest&follow&依赖 -rest&paper&AGGREGATION -Hadoop architecture&Section II&依赖 -Hadoop architecture&architecture&GENERALIZATION -Section IV&problem statement&依赖 -we¤t Hadoop architecture&依赖 -we¤t Hadoop architecture&依赖 -we&issue&依赖 -we¤t Hadoop architecture&依赖 -we&issue&依赖 -its&NameNode& -our&motivation& -we&issue&依赖 -area&future work&AGGREGATION -We&Section vius&依赖 -HADOOP ARCHITECTURE Hadoop&several sub-project&依赖 -MapReduce&4 ]&依赖 -HADOOP ARCHITECTURE Hadoop&hadoop common hdfs&依赖 -MapReduce&4 ]&依赖 -this section briefly&sub-project&依赖 -this section briefly&Hadoop namely Hadoop Common&依赖 -sub-project&Hadoop namely Hadoop Common&AGGREGATION -Hadoop Common&Filesystem&依赖 -contribution area&3 ]&依赖 -contribution area&other Hadoop community project&依赖 -its&storage& -Kosmos Distributed File System )&[ 3 ]&依赖 -Kosmos Distributed File System )&[ 3 ]&依赖 -ten&petabyte&AGGREGATION -petabyte&storage&AGGREGATION -hdf&OS [ 3 ]&依赖 -hdf&top&依赖 -filesystem&OS [ 3 ]&AGGREGATION -hdf&filesystem&依赖 -top&filesystem&AGGREGATION -hdf&Java language [ 3 ]&依赖 -master/slave architecture&architecture&GENERALIZATION -hdf&master/slave architecture&依赖 -typical Hadoop cluster&NameNode&依赖 -NameNode&HDFS namespace&依赖 -Datanodes&actual datum&依赖 -machine&GNU/Linux OS&依赖 -machine&Java&依赖 -hdf&machine&依赖 -Usage&portable and all pervasive Java language&AGGREGATION -wide range&machine&AGGREGATION -dedicated machine&machine&GENERALIZATION -dedicated machine&NameNode software&依赖 -typical deployment&dedicated machine&依赖 -one instance&Datanode software&AGGREGATION -architecture&multiple datanode&依赖 -MapReduce&huge data set&依赖 -MapReduce&its simplicity and functionality [&依赖 -MapReduce&7 ]&实现 -huge data set&distributed application&AGGREGATION -its&simplicity& -MapReduce&distributed application&依赖 -integral part&Hadoop&AGGREGATION -It&large data set&依赖 -It&distributed computing&依赖 -cluster&computer&AGGREGATION -It&computer&依赖 -It&cluster&依赖 -MapReduce&datum&依赖 -master node&major input&依赖 -master node&typical ― Map ‖ function&依赖 -worker node&process&依赖 -worker node&node&GENERALIZATION -worker node&received problem chunk&依赖 -master node&" Reduce " function&依赖 -master node&processed sub-problem&依赖 -part&Hadoop project&AGGREGATION -it&map and reduction operation&依赖 -it&unnoticed distributed processing&依赖 
-unnoticed distributed processing&map and reduction operation&AGGREGATION -multiple map function¶llel&依赖 -number&CPUs&AGGREGATION -output&same reducer&依赖 -map operation&operation&GENERALIZATION -map operation&same key&依赖 -set&' reducer&AGGREGATION -output&map operation&AGGREGATION -MapReduce&larger dataset&依赖 -MapReduce&handle&依赖 -petabyte&datum&AGGREGATION -parallelism&high availability&依赖 -parallelism&probability&依赖 -parallelism&probability&依赖 -parallelism&high availability&依赖 -case&partial failure&AGGREGATION -parallelism&probability&依赖 -probability&high availability&AGGREGATION -partial failure&servers or storage&AGGREGATION -parallelism&high availability&依赖 -parallelism&high availability&依赖 -parallelism&probability&依赖 -parallelism&high availability&依赖 -parallelism&probability&依赖 -rack name&worker node&AGGREGATION -rack name&network switch&AGGREGATION -rack name&[ 3 ]&依赖 -information&Hadoop application&依赖 -information&command&依赖 -HDFS filesystem&datum&依赖 -they&information&依赖 -HDFS filesystem&filesystem&GENERALIZATION -case&rack power or switch failure&AGGREGATION -hdf&reliable and extremely fast computations [ 5 ]&依赖 -hdf&numerous data blocks replica&依赖 -hdf&a cluster&依赖 -hdf&communication and client&依赖 -communication and client&RPC&依赖 -hdf&TCP/IP layer&依赖 -hdf&64 mb )&依赖 -ideal file size&64 mb )&依赖 -multiple&64 mb )&AGGREGATION -hdf&multiple&依赖 -hdf&large file&依赖 -datum&three node&依赖 -datum&default replication value&依赖 -replication&[&AGGREGATION -Data node&datum&依赖 -Figure 1&hdf&依赖 -client&single NameNode machine&依赖 -client&file metada or file modification&依赖 -NameNode and Datanodes&built-in webservers [ 6 ]&依赖 -current status&cluster&AGGREGATION -their&]& -NameNode&HDFS metadata [ 7 ]&依赖 -system&flows&依赖 -system&such a way&依赖 -user datum&NameNode [ 7 ]&依赖 -hdfs architecture&work&依赖 -[ 7 ] NameNode&Datanode&依赖 -[ 7 ] NameNode&periodical heartbeat message&依赖 -case&network partition&AGGREGATION -subset&Datanodes&AGGREGATION -datanode&recent heartbeat&依赖 -Datanode death&block&依赖 -Datanode death&replication factor&依赖 -replication factor&block&AGGREGATION -their&value& -NameNode&which&依赖 -replication factor&file&AGGREGATION -hdf&HDFS namespace&依赖 -set&Datanodes&AGGREGATION -file&one or more block&依赖 -block&set&依赖 -block&Datanodes&依赖 -reference&block&AGGREGATION -NameNode&block&依赖 -NameNode&reference&依赖 -NameNode&reference&依赖 -NameNode&block&依赖 -NameNode&HDFS namespace operation&依赖 -NameNode&block&依赖 -mapping&block&AGGREGATION -NameNode&mapping&依赖 -NameNode&mapping&依赖 -NameNode&block&依赖 -system&clients& -Datanodes&block creation&依赖 -NameNode&HDFS namespace&依赖 -NameNode&Edit Log&依赖 -modification&place&依赖 -modification&file system metada&依赖 -transaction log&log&GENERALIZATION -NameNode&transaction log&依赖 -NameNode&file&依赖 -NameNode&local host OS file system&依赖 -its&system& -entire file system namespace&file&依赖 -NameNode&system& -Silage&’s local file system&依赖 -Silage&[ 7 ]&依赖 -Silage&file&依赖 -NameNode&memory& -image&entire file system namespace and file Block map&AGGREGATION -image&’s system memory ( ram )&依赖 -large number&files and directory&AGGREGATION -4GB&RAM&AGGREGATION -it&silage and edit log&依赖 -it&disk&依赖 -in-memory representation&Silage&AGGREGATION -It&old EditLog&依赖 -transaction&persistent FsImage&依赖 -its&transactions& -procedure&checkpoint&依赖 -NameNode&up [ 7 ]&依赖 -checkpoint¤t implementation&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs 
datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&file&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&datanode store hdfs datum&依赖 -datanode store hdfs datum&local file system&依赖 -datanode store hdfs datum&file&依赖 -Datanode&knowledge&依赖 -Datanode&HDFS file&依赖 -It&HDFS datum&依赖 -It&block&依赖 -It&HDFS datum&依赖 -block&HDFS datum&AGGREGATION -It&block&依赖 -It&block&依赖 -It&HDFS datum&依赖 -Datanode&file&依赖 -Datanode&same directory&依赖 -optimal number&file&AGGREGATION -it&heuristic&依赖 -It&local file&依赖 -local file system&single directory&依赖 -local file system&huge number&依赖 -huge number&file&AGGREGATION -It&same directory&依赖 -list&HDFS data block&AGGREGATION -it&local file system&依赖 -NameNode&file&依赖 -NameNode&client request&依赖 -HDFS client&file datum&依赖 -HDFS client&temporary local file&依赖 -HDFS client&reality&依赖 -HDFS client&client&GENERALIZATION -Application write&temporary local file&依赖 -client contact&NameNode&依赖 -local file&datum worth&依赖 -local file&one HDFS block size&依赖 -client contact&NameNode&依赖 -client contact&NameNode&依赖 -namenode insert&file name&依赖 -namenode insert&file system hierarchy&依赖 -namenode insert&file name&依赖 -namenode insert&file system hierarchy&依赖 -NameNode&Datanode&依赖 -NameNode&identity&依赖 -identity&Datanode&AGGREGATION -NameNode&client request&依赖 -client&datum&依赖 -client&block&依赖 -client&datum&依赖 -block&datum&AGGREGATION -client&datum&依赖 -client&block&依赖 -client&block&依赖 -un-flushed datum&Datanode&依赖 -client&NameNode&依赖 -NameNode&persistent store&依赖 -NameNode&point&依赖 -NameNode&file creation operation&依赖 -its&store& -PROBLEM STATEMENT AND MOTIVATION&architecture&实现 -PROBLEM STATEMENT AND MOTIVATION&Hadoop [ 7 ]&实现 -PROBLEM STATEMENT AND MOTIVATION&Hadoop [ 7 ]&实现 -architecture&Hadoop [ 7 ]&AGGREGATION -PROBLEM STATEMENT AND MOTIVATION&Hadoop [ 7 ]&实现 -PROBLEM STATEMENT AND MOTIVATION&architecture&实现 -PROBLEM STATEMENT AND MOTIVATION&architecture&实现 -PROBLEM STATEMENT AND MOTIVATION&Hadoop [ 7 ]&实现 -usage&single NameNode machine&AGGREGATION -PROBLEM STATEMENT AND MOTIVATION&architecture&实现 -simplicity&cost , namely scalability and high availability issue&依赖 -maximum number&Datanodes&AGGREGATION -hdf&one distinctive manager/controller server machine&依赖 -server machine&machine&GENERALIZATION -hdf&NameNode&依赖 -it&outstanding client request&依赖 -Datanode&operation&依赖 -NameNode server restoration process&hour&依赖 -hdf&Secondary Namenode&依赖 -NameNode&information& -directory information&information&GENERALIZATION -Secondary NameNode function&directory information&依赖 -periodic image-based snapshot&directory information&AGGREGATION -Secondary NameNode function&periodic image-based snapshot&依赖 -edit log&an up-to-date directory structure [ 3 ]&依赖 
-entire journal&HDFS action&AGGREGATION -above-mentioned issue&make&依赖 -above-mentioned issue&us&依赖 -above-mentioned issue&us&依赖 -above-mentioned issue&a way&依赖 -above-mentioned issue&a way&依赖 -above-mentioned issue&make&依赖 -paper&problem&依赖 -paper&solution&依赖 -it&node [ 9 ]&依赖 -it&key&依赖 -it&unique key&依赖 -v. chord protocol functional&chord protocol&AGGREGATION -Chord&[ 10 ]&依赖 -node&equal number&依赖 -consistent hash&load balancing&依赖 -node&key&依赖 -less reallocation&key&AGGREGATION -node&system [ 9 ]&依赖 -equal number&key&AGGREGATION -chord protocol address&fundamental issue&依赖 -node&a cluster include load balancing&依赖 -chord protocol address&fundamental issue&依赖 -It&load balancing&依赖 -hash function&function&GENERALIZATION -large cluster&node&AGGREGATION -chord lookup cost increase&log&依赖 -log&number&AGGREGATION -chord lookup cost increase&node&依赖 -chord lookup cost increase&log&依赖 -chord lookup cost increase&number&依赖 -chord lookup cost increase&node&依赖 -number&node&AGGREGATION -chord lookup cost increase&log&依赖 -chord lookup cost increase&number&依赖 -chord lookup cost increase&node&依赖 -chord lookup cost increase&node&依赖 -chord lookup cost increase&chord lookup cost increase&依赖 -chord lookup cost increase&number&依赖 -chord lookup cost increase&number&依赖 -chord lookup cost increase&log&依赖 -chord lookup cost increase&chord lookup cost increase&依赖 -chord lookup cost increase&chord lookup cost increase&依赖 -chord lookup cost increase&chord lookup cost increase&依赖 -additional parameter tuning&scaling&依赖 -its&tables& -Chord&high availability&依赖 -fault tolerant cluster&node&AGGREGATION -system&change [ 9 ]&依赖 -continuous state&change [ 9 ]&AGGREGATION -application&it&依赖 -fail safe nature&Chord software&AGGREGATION -form&library&AGGREGATION -fail safe nature&form&依赖 -fail safe nature&library&依赖 -system&Chord&依赖 -system&two fold&依赖 -system&two fold&依赖 -system&Chord&依赖 -function&node&依赖 -application&Chord library&依赖 -function&IP address&依赖 -application&function&依赖 -IP address&node&AGGREGATION -Chord library&library&GENERALIZATION -set&key&AGGREGATION -application&node&依赖 -application&Chord software&依赖 -new node&cluster [ 9 ]&依赖 -update&application software&依赖 -their&nodes& -update&respective value&依赖 -user-friendly naming&datum&AGGREGATION -requirements implementation&implementation&GENERALIZATION -flat key-space feature&Chord&AGGREGATION -flat key-space feature&requirements implementation&实现 -cryptographic hash&datum&AGGREGATION -application&datum&依赖 -application&Chord key&依赖 -Chord key&key&GENERALIZATION -application&data replication&依赖 -data&identifier& -several content provider&other ’s datum&依赖 -set&software development project&AGGREGATION -node&software development project&依赖 -everyone&periodic release&依赖 -node&load& -aggregate cost&cluster&AGGREGATION -implementation&Chord&依赖 -implementation&map data block&依赖 -Dabek et al. [ 12 ]&implementation&依赖 -Dabek et al. 
[ 12 ]&concept&实现 -implementation&concept&AGGREGATION -implementation&map data block&依赖 -implementation&Chord&依赖 -Chord&load balance&依赖 -Chord&application&依赖 -their&data& -they&return&依赖 -they&having&依赖 -node&’s datum&依赖 -node&data& -data&name& -Several similar problem&cooperative mirroring application&依赖 -goal&high availability&依赖 -Our&aim& -Our&architecture& -proposed architecture&NameNode Clustered chord ( nucu )&依赖 -client&single NameNode machine&依赖 -client&single NameNode machine&依赖 -client&file metadata or file modification&依赖 -client&file metadata or file modification&依赖 -client&single NameNode machine&依赖 -client&file metadata or file modification&依赖 -client&file metadata or file modification&依赖 -client&single NameNode machine&依赖 -We&client request&依赖 -We&resource request&依赖 -resource request&key&依赖 -resource request&consistent hashing algorithm&依赖 -NameNode&resource request reply ( rrp )&依赖 -NameNode&client resource request&依赖 -client resource request&resource request&GENERALIZATION -RRQ&NCUC black-box&依赖 -client&respective Datanodes&依赖 -client&respective Datanodes&依赖 -workflow&Figure 2&依赖 -NCUC&NCUC NameNodes&依赖 -NCUC&following way&依赖 -NCUC&key&依赖 -identifier&NCUC identifier NameNode ring modulo 2k&依赖 -whose&identifier& -identifier&z&AGGREGATION -NCUC&a k-bit identifier use sha-1 [ 16 ]&依赖 -NCUC&NameNode&依赖 -NCUC&consistent hash function&依赖 -its&key& -NameNode&identifier& -NameNode&address& -NameNode&successor NameNode&依赖 -successor NameNode&key z , or succ ( z )&AGGREGATION -successor NameNode&NameNode&GENERALIZATION -NCUC black-box&five key&依赖 -NameNode&key z , or succ ( z )&依赖 -NCUC black-box&ten NameNodes&依赖 -circle&number&AGGREGATION -Figure3&NCUC ring&依赖 -NCUC ring&ten NameNodes&依赖 -NCUC ring&ten NameNodes&依赖 -so key 10&NameNode 14&依赖 -successor&identifier 10 , succ ( 10 )&AGGREGATION -key 24&NameNode 32&依赖 -namenodes join&NCUC cluster&依赖 -namenodes join&NCUC cluster&依赖 -namenodes join&little interruption&依赖 -namenodes join&little interruption&依赖 -NCUC cluster&cluster&GENERALIZATION -n&successor& -NameNode n&NCUC clustered ring&依赖 -n&departure& -n&keys& -further change&keys allocation&依赖 -further change&NCUC namenode&依赖 -it&identifier 24&依赖 -it&key&依赖 -it&identifier 32&依赖 -it&key&依赖 -it&NameNode&依赖 -quick distributed calculation&hash function&AGGREGATION -NCUC&quick distributed calculation&依赖 -NCUC&hash function&依赖 -NCUC map&consistent hash [ 14 , 15 ]&依赖 -Nth NameNode&NCUC cluster&依赖 -NCUC hash function&hash function&GENERALIZATION -Nth NameNode&NameNode&GENERALIZATION -NCUC hash function&load balancing&依赖 -NameNodes approximately equal number&key&AGGREGATION -merely o ( 1/n ) portion&key&AGGREGATION -NameNode&table&依赖 -Workflow&ncuc architecture begin client resource requests – hash&AGGREGATION -NameNode&o ( logn&依赖 -lookup&) message&依赖 -lookup&o ( log n&依赖 -usage&Chord protocol [ 9 ]&AGGREGATION -one&primary goal&AGGREGATION -It&single HDFS NameNode architecture&实现 -simplicity&single HDFS NameNode architecture&AGGREGATION -It&simplicity&依赖 -primary goal&simple HDFS architecture&AGGREGATION -its&limitation& -single point-of-failure HDFS NameNode&alternative solution&依赖 -our&implementation& -we&performance analysis first&依赖 -we&Chord&依赖 -we&set&依赖 -we&experiment&依赖 -our&architecture& -set&experiment&AGGREGATION -we&2 Linux Amazon Cloud EC2 node&依赖 -We¤t HDFS architecture&依赖 -Table 1¤t HDFS architecture&依赖 -size 512 MB&512 MB&GENERALIZATION -Table 1&result&依赖 -single file&size 512 MB&AGGREGATION -Table 1&single file&依赖 -Table 2&size 512 MB&依赖 -Table 2&size 512 MB&依赖 -Table 
2&result&依赖 -Table 2&result&依赖 -Table 2&single file&依赖 -Table 2&single file&依赖 -512 MB&nrfile = 5 , replication = 1 op&依赖 -512 MB&nrfile = 5 , replication = 1 op&依赖 -I/O rate ( mb/s&0 0 0 0&依赖 -term&I/O rate&AGGREGATION -term&throughput&AGGREGATION -I/O rate ( mb/s&0 0 0 0&依赖 -IaaS , PaaS and saa&cost&依赖 -Hadoop application&data storage&依赖 -Hadoop application&primary distributed file system&依赖 -whose&NameNode& -proposed architecture&single-point-of-failure&依赖 -availability and scalability&HDFS architecture&AGGREGATION -its&single-point-of-failure& -proposed architecture&availability and scalability&依赖 -proposed architecture&HDFS architecture&依赖 -little complexity&approach&依赖 -little complexity&HDFS NameNode&依赖 -we&extensive experiment&依赖 -result&future extensive evaluation process&AGGREGATION -our&process& -acknowledgment author&support&依赖 -King Fahd University KFUPM&Petroleum and Minerals&AGGREGATION -project&number&GENERALIZATION -project&King Abdulaziz City kacst )&依赖 -project&Technology&依赖 -Berkeley View&Cloud Computing&AGGREGATION -technical report eecs-2009-28 and http://www.eecs.berkeley.edu/Pubs/TechRpts/2009/EECS-2009-28.html and Feb.&technical report eecs-2009-28 and http://www.eecs.berkeley.edu/Pubs/TechRpts/2009/EECS-2009-28.html and Feb.&依赖 -Theory&Computing&AGGREGATION -― Serving dn&Chord&依赖 -[ 12 ] F. Dabek&Proc&依赖 -― Analysis&evolution&AGGREGATION -evolution&peer-to-peer system&AGGREGATION -13 ] D. Liben-Nowell&Proc&依赖 -principle distribute computing ( podc ) and CA , July 2002 , pp.&distribute computing ( podc )&AGGREGATION -protocol&Proc&依赖 -protocol&Proc&依赖 -protocol&Proc&依赖 -Master&thesis& -Department&Electric&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt b/src/main/resources/sdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt deleted file mode 100644 index 22c07123044141d801714c213d49334f7f0c1328..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt +++ /dev/null @@ -1,205 +0,0 @@ -Towards A Scalable HDFS Architecture -Farag Azzedin -Information and Computer Science Department -King Fahd University of Petroleum and Minerals -Dhahran, 31261, Saudi Arabia -fazzedin@kfupm.edu.sa -Abstract—Cloud computing infrastructures allow corporations to reduce costs by outsourcing computations on-demand. One of the areas cloud computing is increasingly being utilized for is large scale data processing. Apache Hadoop is one of these large scale data processing projects that supports data-intensive distributed applications. Hadoop applications utilize a distributed file system for data storage called Hadoop Distributed File System (HDFS). HDFS architecture, by design, has only a single master node called NameNode, which manages and maintains the metadata of storage nodes, called Datanodes, in its RAM. Hence, HDFS Datanodes’ metadata is restricted by the capacity of the RAM of the HDFS’s single-point-of-failure NameNode. This paper proposes a fault tolerant, highly available and widely scalable HDFS architecture. The proposed architecture provides a distributed NameNode space eliminating the drawbacks of the current HDFS architecture. This is achieved by integrating the Chord protocol into the HDFS architecture. -Keywords-Cloud Computing Platform, Hadoop, HDFS, Chord, Distributed NameNode -I. 
INTRODUCTION AND RELATED WORK -Cloud computing environments can provide large-scale datacenters at reduced cost by using or integrating the service models namely software as a service (SaaS), platform as a service (PaaS) and infrastructure as a service (IaaS). This results in the increased income for cloud computing service providers and decrease costs for cloud users [1, 2]. This motivated researchers to introduce systems such as Google's MapReduce, Google File System (GFS) and Hadoop [3]. The Apache Hadoop is one of the cloud computing projects; built using Java aiming to develop open-source software for reliable and scalable data intensive distributed computing [2, 3, 4]. Yahoo!, which uses Hadoop in its applications [3], has been the major contributor to this project. -Hadoop applications utilize a distributed file system for data storage called Hadoop Distributed File System (HDFS). HDFS implementation may contain hundreds or even thousands of server machines, each storing some part of the file system’s data. This opens the door for more hardware failures because more server machines mean more hardware and thus high probability of hardware failures. This makes it inevitable that some component of HDFS is always non-functional. Thus, faults detection, alarms, and automatic prompt server recovery are fundamental architectural goals of HDFS [3, 7]. The same applies for a NameNode server in an HDFS cluster. HDFS’s performance heavily relies on the availability of single NameNode machine. Currently, automatic restart and failover of the NameNode software to another machine is not supported [3, 7]. -Hadoop applications utilize HDFS [7], which has only a single master node called NameNode, which manages and maintains the metadata of storage nodes, called Datanodes, in its RAM [7]. Hence, HDFS Datanodes’ metadata is restricted by the capacity of the RAM of the HDFS’s single-point-of-failure NameNode. This paper proposes a fault tolerant, highly available and widely scalable HDFS architecture having a NameNode which is distributed and will not suffer HDFS failure in case of a single NameNode failure. This is achieved by using Chord protocol to introduce clustering in HDFS NameNode. -Several research projects have used Chord as a basis for their research. In a peer-to-peer system, the Chord File System (CFS) stores files and metadata and uses Chord to locate storage blocks [12]. Several investigation methods have proved that Chord’s stabilization algorithms with fewer changes provide decent lookup performance, regardless of constant failure and joining of nodes [13, 17]. DNS provides a lookup service, with host names as keys and IP addresses (and other host information) as values [9]. Chord could provide the same service by hashing each host name to a key [11]. Chord-based DNS would require no special servers, while ordinary DNS relies on a set of special root servers. DNS requires manual management of the routing information (NS records) that allows clients to navigate the name server hierarchy; Chord automatically maintains the correctness of the analogous routing information [9]. DNS only works well when host names are structured to reflect administrative boundaries; Chord imposes no naming structure [9]. DNS is specialized to the task of finding named hosts or services, while Chord can also be used to find data objects that are not tied to particular machines [9]. -The rest of the paper is organized as follows. Hadoop architecture is briefly introduced in Section II. 
This is followed by Section III explaining how Hadoop and HDFS work. Next, we discuss the issues in the current Hadoop architecture with respect to its NameNode, while Section IV outlines the problem statement and our motivation to propose a modification to the existing HDFS architecture. We also discuss the Chord architecture in Section V. Section VI presents the proposed HDFS NameNode solution and architecture. We conclude and specify areas of future work in Section VII.
-II. EXISTING HADOOP ARCHITECTURE
-Hadoop includes several sub-projects, namely Hadoop Common, HDFS, and MapReduce, besides several others [4]. This section briefly discusses the sub-projects of Hadoop, namely Hadoop Common, HDFS and MapReduce.
-Hadoop Common: Hadoop Common provides the common utilities to support the other Hadoop subprojects and provides the entry point to the Hadoop filesystems [3, 4]. Hadoop Common contains Filesystem, RPC, serialization libraries, some required jar files and scripts to initiate Hadoop [3]. Also, Hadoop Common provides source code, documents, and a contribution area that contains other Hadoop community projects [3].
-HDFS: Although Hadoop supports other storage filesystems to store datasets, including the Amazon S3 filesystem, CloudStore (a.k.a. Kosmos Distributed File System), the FTP filesystem, and read-only HTTP and HTTPS filesystems [3], HDFS is its primary storage and rack-aware filesystem that is used by its applications [3, 4]. HDFS is designed to work with large data sets requiring tens of petabytes of storage. HDFS operates on top of the filesystems of the underlying OS [3]. HDFS is written in the Java language [3]. HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware [7].
-HDFS has a master/slave architecture. A typical Hadoop cluster is mainly comprised of a NameNode and several Datanode machines. The NameNode manages the HDFS namespace and regulates access to files that are requested by clients [7]. Datanodes, which manage storage attached to the nodes that they run on, store the actual data [6, 7].
-The NameNode and Datanode are software programs designed to run on everyday-use machines. These machines typically run a GNU/Linux OS. HDFS can be run on any machine that supports Java and therefore can run either the NameNode or the Datanode software. Usage of the highly portable and all-pervasive Java language means that HDFS can be deployed on a wide range of machines. A typical deployment has a dedicated machine that runs only the NameNode software. Each of the other machines in the cluster runs one instance of the Datanode software. The architecture does not prevent running multiple Datanodes on the same machine but, in practice, that is rarely the case [7].
-MapReduce: For huge data sets of distributed applications, MapReduce is well known for its simplicity and functionality [7]. It serves as an integral part of Hadoop to support distributed computing on large data sets on clusters of computers [8]. MapReduce can be applied to data stored either in a filesystem (unstructured) or within a database (structured) [8]. During a typical "Map" function, the master node accepts a major input, slices it into several minor sub-problems, and allocates them to worker nodes [8]. A worker node could repeat this process again, if needed, resulting in a multi-level tree structure [8].
Finally, the worker node processes the received problem chunk and returns the processed data back to its master node [8]. In the "Reduce" function, the master node receives the processed sub-problems and aggregates them in some way to form the output [8].
-MapReduce has been wisely chosen to be part of the Hadoop project because it enables transparent distributed processing of the map and reduce operations. Hence, multiple map functions can be run in parallel, given that each mapping operation is independent of the others. In reality, however, this condition is limited by the data source and/or the number of CPUs near that data. Likewise, a set of 'reducers' can be run all at the same time during the reduction phase, given that all outputs of the map operation which share the same key are presented to the same reducer simultaneously. Although this procedure may look inefficient compared to sequential algorithms, MapReduce can be applied to potentially larger datasets than "commodity" servers can handle. For instance, using MapReduce, a large server cluster can sort a petabyte of data in only a few hours. Moreover, this parallelism also improves availability in the case of a partial failure of servers or storage during the operation: if one mapper or reducer fails, the work can be rescheduled, given that the input data is still available [8].
-III. HOW HADOOP AND HDFS WORK?
-For productive work scheduling, the filesystems usually provide location awareness, i.e., the rack name of a worker node, for instance the rack name of its network switch [3]. This information helps Hadoop applications to execute commands on the corresponding compute nodes [3]. When the HDFS filesystem starts replicating data, it uses this information to keep multiple data copies on redundant racks [3]. The overall objective here is to provide high data availability in case of rack power or switch failure [3]. HDFS generates numerous data block replicas and distributes them over the Datanodes throughout a cluster to accomplish reliable and extremely fast computations [5]. HDFS provides high throughput access to application data [7].
-HDFS uses the TCP/IP layer for communication, and clients use RPC to communicate with each other. HDFS can store large files (an ideal file size is a multiple of 64 MB) across multiple Datanode machines. HDFS accomplishes reliability by duplicating the data across multiple hosts, and hence does not require RAID storage on hosts. With the default replication value, 3, data is stored on three nodes: two on the same rack, and one on a different rack. Data nodes can talk to each other to rebalance data, to move copies around, and to keep the replication of data high [3].
-Figure 1 shows how Hadoop uses HDFS to function. Clients contact the single NameNode machine for file metadata or file modifications and perform actual file I/O directly with the Datanodes [6]. To easily find out the current status of a cluster, the NameNode and Datanodes use their built-in webservers [6]. The NameNode is the judge and source for all HDFS metadata [7]. The system is designed in such a way that user data never flows through the NameNode [7].
-Figure 1: How the HDFS architecture works [7]
-The NameNode receives a periodic heartbeat message from each Datanode.
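Before continuing with the failure-handling details below, the Map and Reduce roles described in Section II can be made concrete with the canonical word-count example, written against Hadoop's org.apache.hadoop.mapreduce API. Job configuration, input and output paths are omitted, so this is a sketch rather than a complete job.

```java
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountSketch {
    /** "Map": each worker turns its slice of the input into (word, 1) pairs. */
    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().split("\\s+")) {
                if (token.isEmpty()) continue;
                word.set(token);
                context.write(word, ONE);
            }
        }
    }

    /** "Reduce": all counts that share the same word arrive at one reducer and are summed. */
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) sum += v.get();
            context.write(key, new IntWritable(sum));
        }
    }
}
```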
The NameNode may get disconnected from a subset of Datanodes in case of a network partition. The NameNode declares Datanodes that have not sent a recent heartbeat dead and stops forwarding any new I/O requests to them. Data that was registered to those dead Datanodes becomes unavailable to HDFS as well. Datanode death may cause the replication factor of some blocks to fall below their specified value. The NameNode constantly tracks which blocks need to be replicated and initiates replication whenever necessary. The need for re-replication may arise for several reasons: a Datanode may become unavailable, a replica may become corrupted, a hard disk on a Datanode may fail, or the replication factor of a file may be increased [7]. -HDFS exposes a file system namespace and enables user data to be stored in block-based file chunks over Datanodes. Specifically, a file is split into one or more blocks, and these blocks are stored in a set of Datanodes. The NameNode also keeps references to these blocks in a block pool. The NameNode executes HDFS namespace operations like opening, closing, and renaming files and directories. The NameNode also maintains and determines the mapping of blocks to Datanodes. The Datanodes are responsible for serving read and write requests from the file system’s clients. The Datanodes also perform block creation, deletion, and replication upon instruction from the NameNode [7]. -The NameNode stores the HDFS namespace. The NameNode keeps a transaction log called the EditLog to continuously record every modification that takes place in the file system metadata. For instance, creating a new file in HDFS causes the NameNode to insert a record into the EditLog representing this action. Likewise, changing the replication factor of a file causes another new record to be inserted into the EditLog. The NameNode uses a file in its local host OS file system to store the EditLog. The entire file system namespace, including the mapping of blocks to files and file system properties, is stored in a file called the FsImage. The FsImage is stored as a file in the NameNode’s local file system as well [7]. -The image of the entire file system namespace and file block map is kept in the NameNode’s system memory (RAM). This key metadata item is designed to be compact, such that a NameNode with 4 GB of RAM is plenty to support a large number of files and directories. When the NameNode starts up, it reads the FsImage and EditLog from disk, applies all the transactions from the EditLog to the in-memory representation of the FsImage, and flushes out this new version into a new FsImage on disk. It can then truncate the old EditLog because its transactions have been applied to the persistent FsImage. This procedure is known as a checkpoint. In the current implementation, a checkpoint only occurs when the NameNode starts up [7]. -The Datanode stores HDFS data in files in its local file system. The Datanode has no knowledge about HDFS files. It stores each block of HDFS data in a separate file in its local file system. The Datanode does not create all files in the same directory. Instead, it uses heuristics to determine the optimal number of files per directory and creates subdirectories appropriately. It is not optimal to create all local files in the same directory because the local file system might not be able to efficiently support a huge number of files in a single directory.
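The FsImage/EditLog checkpoint described above can be sketched in a few lines. This is only a minimal, self-contained illustration with a hypothetical one-record-per-line file format; it is not the actual HDFS implementation.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Hypothetical sketch of the startup checkpoint: merge the EditLog into the FsImage. */
public class CheckpointSketch {
    public static void checkpoint(Path fsImage, Path editLog) throws IOException {
        // 1. Load the persisted namespace image (here: one "path=blockId" entry per line).
        Map<String, String> namespace = new HashMap<>();
        if (Files.exists(fsImage)) {
            for (String line : Files.readAllLines(fsImage)) {
                String[] kv = line.split("=", 2);
                namespace.put(kv[0], kv.length > 1 ? kv[1] : "");
            }
        }
        // 2. Replay every logged transaction against the in-memory namespace.
        if (Files.exists(editLog)) {
            for (String tx : Files.readAllLines(editLog)) {
                if (tx.startsWith("CREATE ")) {            // e.g. "CREATE /a/b=blk_1"
                    String[] kv = tx.substring(7).split("=", 2);
                    namespace.put(kv[0], kv.length > 1 ? kv[1] : "");
                } else if (tx.startsWith("DELETE ")) {     // e.g. "DELETE /a/b"
                    namespace.remove(tx.substring(7));
                }
            }
        }
        // 3. Flush the merged state to a new FsImage and truncate the old EditLog.
        List<String> merged = new ArrayList<>();
        namespace.forEach((path, block) -> merged.add(path + "=" + block));
        Files.write(fsImage, merged);
        Files.write(editLog, new byte[0]);
    }
}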
When a Datanode starts up, it scans through its local file system, generates a list of all HDFS data blocks that correspond to each of these local files, and sends this report to the NameNode: this is the Blockreport [7]. -A client request to create a file does not reach the NameNode immediately. Instead, the HDFS client first caches the file data in a temporary local file. Application writes are transparently redirected to this temporary local file. When the local file accumulates data worth over one HDFS block size, the client contacts the NameNode. The NameNode inserts the file name into the file system hierarchy and allocates a data block for it. The NameNode responds to the client request with the identity of the Datanode and the destination data block. Then the client flushes the block of data from the local temporary file to the specified Datanode. When a file is closed, the remaining un-flushed data in the temporary local file is transferred to the Datanode. The client then informs the NameNode that the file is closed. At this point, the NameNode commits the file creation operation into its persistent store. If the NameNode dies before the file is closed, the file is lost [7]. -IV. PROBLEM STATEMENT AND MOTIVATION -The usage of a single NameNode machine in a cluster greatly simplifies the architecture of Hadoop [7]. However, this simplicity comes at a cost, namely scalability and high-availability issues. -Scalability Issue: The HDFS NameNode server's RAM size limits the metadata, and thus the maximum number of Datanodes and/or the related transactional metadata, that can be supported in an HDFS cluster [7]. -High Availability Issue: HDFS needs one distinguished manager/controller server machine, the NameNode. This is a single point of failure for an HDFS implementation. Once this NameNode fails, HDFS goes offline. After it gets back online, it must respond to all outstanding client requests and Datanode management operations. The NameNode server restoration process can take over half an hour for a large cluster. HDFS also includes a Secondary NameNode, whose name misleads some people into thinking that when the Primary NameNode fails, the Secondary NameNode becomes active and takes over. In reality, the Secondary NameNode only builds periodic image-based snapshots of the Primary NameNode's directory information and saves them to local or remote directories. These image-based checkpoints can only be used to restart a failed Primary NameNode without having to replay the entire journal of HDFS actions (the edit log) to create an up-to-date directory structure [3]. -The above-mentioned issues in the HDFS NameNode architecture motivated us to find a way to make the NameNode seamlessly highly available and scalable, and thus to improve HDFS performance. The Chord protocol and its applications can provide a reliable and efficient solution to these problems. This paper proposes a solution to these problems by modifying the existing HDFS NameNode architecture. -V. CHORD PROTOCOL -The function of the Chord protocol is simple: given a unique key, it maps that key to a node [9]. This node, depending on the application using Chord, could be in charge of storing the value corresponding to that key [9]. Chord employs consistent hashing [10] to allocate keys to Chord nodes [9].
Because each node receives approximately the same number of keys, consistent hashing balances load and requires comparatively little reallocation of keys when nodes join and leave the system [9]. -The Chord protocol decisively addresses the fundamental issues that arise when a node joins or leaves a cluster, including load balancing, scalability, and availability. It achieves load balancing by acting as a distributed hash function and assigning keys uniformly over the nodes. Chord is highly scalable and can work for very large clusters of nodes, since its lookup cost increases only with the logarithm of the number of nodes. Additionally, no parameter tuning is necessary to accomplish this scaling. Chord achieves high availability by automatically adjusting its internal tables as nodes join and leave the cluster (due to failure, maintenance, etc.). This ensures a highly fault-tolerant cluster of nodes: the node responsible for a key can always be found, irrespective of nodes joining or leaving the cluster, even if the system is in a continuous state of change [9]. -The Chord software takes the form of a library that is linked with the applications using it. An application interacts with Chord in two ways. First, the Chord library provides a lookup function that yields the IP address of the node responsible for a given key. Second, the Chord software on each node notifies the application of changes in the set of keys that the node is responsible for. This notification enables the application software, for instance, to move the respective values to their new location nodes when a node joins or leaves the cluster [9]. -Authentication, caching, replication, and user-friendly naming of data are provided by the application using Chord. Chord's flat key space simplifies the implementation of these requirements. For instance, for data authentication, an application may store the data under a Chord key that is derived from a cryptographic hash of the data itself. Likewise, the application may achieve data replication by storing the data under two separate Chord keys derived from the data's application-level identifier. Other instances where Chord provides a good basis include cooperative mirroring and time-shared storage [9]. -In cooperative mirroring, several content providers work together to store and serve each other's data. These nodes could be a set of software development projects, each making periodic releases. Distributing the accumulated workload uniformly over all nodes reduces the aggregate cost of the cluster, because every node only needs to provide capacity for the average load rather than for its peak load. Dabek et al. [12] describe an implementation of this concept that uses Chord to map data blocks onto servers; Chord interacts with the application to balance load, replicate data, and perform latency-based server selection [9]. -In time-shared storage, nodes that want high availability but have only sporadic connectivity may offer to store other nodes' data while they are connected, in return for having their own data stored elsewhere when they are disconnected. The data's name can be used as the key to find the live Chord node that is acting as the data store at any point in time.
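As a small illustration of the key-derivation idea described above, the sketch below hashes the data itself (for self-authenticating storage) or an application-level identifier (for replication under two keys). The class and helper names are hypothetical, and SHA-1 is chosen only because the paper later uses it as the base hash function.

import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

/** Hypothetical sketch: derive Chord keys from application data or identifiers. */
public class ChordKeys {
    static BigInteger sha1(byte[] input) throws NoSuchAlgorithmException {
        return new BigInteger(1, MessageDigest.getInstance("SHA-1").digest(input));
    }

    /** Self-authenticating storage: the key is a cryptographic hash of the content itself. */
    static BigInteger contentKey(byte[] data) throws NoSuchAlgorithmException {
        return sha1(data);
    }

    /** Replication: two distinct keys derived from one application-level identifier. */
    static BigInteger[] replicaKeys(String identifier) throws NoSuchAlgorithmException {
        return new BigInteger[] {
            sha1((identifier + "#replica-0").getBytes()),
            sha1((identifier + "#replica-1").getBytes())
        };
    }
}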
Several problems similar to those of the cooperative mirroring application arise in time-shared storage as well; however, the goal here is to achieve high availability rather than load balancing [9]. -VI. PROPOSED ARCHITECTURE -Our aim is to enhance the HDFS architecture so that the HDFS NameNode is highly available and scalable. This is accomplished by integrating the Chord protocol into the existing HDFS NameNode architecture. Our proposed architecture is referred to as NameNode Clustered Using Chord (NCUC). -A. NCUC Architecture -With the single HDFS NameNode in the existing architecture, clients contact the single NameNode machine for file metadata or file modifications and perform the actual file I/O directly with the Datanodes [6]. We will refer to these client requests as resource requests. Each resource request is hashed into a key using a consistent hashing algorithm to form a resource request query (RRQ). The NameNode responds to a client resource request with a resource request reply (RRP). -The NCUC architecture is simple. The RRQ is passed to the NCUC black-box, which provides the client with the RRP. The clients, upon receiving the RRP, contact the respective Datanodes specified in the RRP to perform the desired I/O requests for the data chunks stored on the Datanodes. This workflow is shown in Figure 2. NCUC, using consistent hashing, maps keys to NCUC NameNodes in the following way. Identifiers are arranged on an NCUC NameNode identifier ring modulo 2^k. A key z is mapped to the first node whose identifier is equal to or follows the identifier of z in the identifier space. -B. Inside the NCUC Black-box: -NCUC uses a consistent hash function to assign each NameNode and each key a k-bit identifier, using SHA-1 [16] as the base hash function. A NameNode's identifier is obtained by hashing the NameNode's static IP address, whereas a key identifier is obtained by hashing the key. It should be noted that the identifier length k should be sufficiently large to make the probability of a NameNode or key hashing to an identifier already assigned to another key or NameNode negligible. -Figure 3: NCUC black-box containing ten NameNodes storing five keys -This NameNode is referred to as the successor NameNode of key z, or succ(z). Assuming the identifiers are arranged as a circle of numbers from 0 to 2^k - 1, succ(z) is the first NameNode clockwise from z. This can be understood using the following example. Figure 3 shows an NCUC ring with k = 6; the ring has ten NameNodes and stores five keys. Here, the successor of identifier 10, succ(10), is NameNode 14, so key 10 would be located at NameNode 14. Likewise, keys 24 and 30 would be located at NameNode 32, key 38 at NameNode 38, and key 54 at NameNode 56. -NCUC's use of consistent hashing is intended to let NameNodes join and leave the NCUC cluster with little disruption. When a NameNode n joins the NCUC ring, some keys formerly allocated to n's successor are reassigned to n in order to preserve the consistent hashing mapping. When n leaves the ring, all of its allocated keys are reallocated to n's successor. No other changes in the allocation of keys to NameNodes are required.
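The ring behaviour described in this section can be sketched in a few lines of Java. This is a simplified, hypothetical illustration that uses the full 160-bit SHA-1 identifier space instead of the toy k = 6 ring of Figure 3; the class name, IP addresses, and key format are made up, and this is not the authors' implementation.

import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Map;
import java.util.TreeMap;

/** Hypothetical sketch of the NCUC identifier ring: hash NameNodes and keys, find succ(z). */
public class NcucRingSketch {
    private final TreeMap<BigInteger, String> ring = new TreeMap<>();

    private static BigInteger sha1(String s) {
        try {
            return new BigInteger(1, MessageDigest.getInstance("SHA-1").digest(s.getBytes()));
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException(e);
        }
    }

    /** A NameNode's identifier is the hash of its static IP address. */
    public void addNameNode(String ipAddress) {
        ring.put(sha1(ipAddress), ipAddress);
    }

    /** On departure, the node's keys implicitly fall through to its successor. */
    public void removeNameNode(String ipAddress) {
        ring.remove(sha1(ipAddress));
    }

    /** succ(z): the first NameNode whose identifier equals or follows hash(key), wrapping around. */
    public String successor(String key) {
        BigInteger z = sha1(key);
        Map.Entry<BigInteger, String> e = ring.ceilingEntry(z);
        return (e != null ? e : ring.firstEntry()).getValue();
    }

    public static void main(String[] args) {
        NcucRingSketch ncuc = new NcucRingSketch();
        ncuc.addNameNode("10.0.0.14");
        ncuc.addNameNode("10.0.0.32");
        ncuc.addNameNode("10.0.0.56");
        // An RRQ (hashed resource request) is routed to the responsible NameNode.
        System.out.println(ncuc.successor("/user/alice/file.txt:block-0"));
    }
}

A TreeMap keeps the NameNode identifiers sorted, so succ(z) is a single ceilingEntry lookup that wraps around to the first entry when z is larger than every NameNode identifier.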
In Figure 3, if a NameNode were to join the NCUC ring with identifier 26, it would take over the key with identifier 24 from the NameNode with identifier 32. -NCUC offers fast, distributed computation of a hash function that assigns keys to NameNodes, using consistent hashing [14, 15]. The NCUC hash function balances load efficiently by allocating approximately the same number of keys to every NameNode, and when an Nth NameNode joins or leaves the NCUC cluster, only an O(1/N) fraction of the keys has to be moved to a different NameNode. -For an NCUC cluster with N NameNodes, each NameNode maintains a routing table with entries for only O(log N) other NameNodes, while a lookup needs O(log N) messages. This is achieved through the use of the Chord protocol [9]. -Figure 2: Workflow of the NCUC architecture (a client resource request is hashed into an RRQ, the RRQ is passed to the NCUC black-box, the NCUC responds with an RRP, the client acknowledges, and the client then contacts the Datanodes). -It is arguable that integrating Chord into the HDFS NameNode architecture affects the simplicity of the single-NameNode design, and simplicity is one of the primary goals of the HDFS architecture. However, the single point of failure of the HDFS NameNode and the limitation of its RAM, which must hold the metadata of all files stored on the Datanodes, require an alternative solution. Integrating Chord into the HDFS NameNode provides a reliable and efficient solution to this problem. -VII. PERFORMANCE ANALYSIS -First, we developed Chord and validated our Chord implementation by reproducing all the experimental results reported in [9]. Second, we performed a set of experiments to evaluate our proposed HDFS architecture. In evaluating our proposed HDFS architecture, we used two Linux Amazon EC2 cloud nodes with Hadoop installed. We then configured and tested the current HDFS architecture as well as our proposed HDFS architecture by performing two operations, namely READ and WRITE. -Table I shows the results of writing and reading a single file of size 512 MB using the current HDFS architecture, while Table II shows the results of writing and reading a single file of size 512 MB using our proposed HDFS architecture. -TABLE I. W/R RESULTS USING CURRENT HDFS ARCHITECTURE. 512 MB, Blocksize=64 MB, nrFiles=5, Replication=1
Op. | Performance Metrics    | Exp.#1 | Exp.#2 | Exp.#3 | Ave.
W   | Throughput (mb/s)      | 129    | 134    | 139    | 134
W   | Ave. I/O rate (mb/s)   | 129    | 134    | 139    | 134
W   | I/O rate std deviation | 0      | 0      | 0      | 0
R   | Throughput (mb/s)      | 150    | 146    | 152    | 150
R   | Ave. I/O rate (mb/s)   | 151    | 147    | 153    | 150
R   | I/O rate std deviation | 13     | 5      | 13     | 10
-TABLE II. W/R RESULTS USING PROPOSED HDFS ARCHITECTURE. 512 MB, Blocksize=64 MB, nrFiles=5, Replication=1
Op. | Performance Metrics    | Exp.#1 | Exp.#2 | Exp.#3 | Ave.
W   | Throughput (mb/s)      | 156    | 151    | 154    | 154
W   | Ave. I/O rate (mb/s)   | 152    | 149    | 150    | 150
W   | I/O rate std deviation | 0      | 0      | 0      | 0
R   | Throughput (mb/s)      | 180    | 180    | 174    | 178
R   | Ave. I/O rate (mb/s)   | 176    | 176    | 176    | 176
R   | I/O rate std deviation | 0      | 0      | 0      | 0
-It can be inferred from these results that our proposed architecture performs better in terms of both throughput and I/O rate. -VIII. FUTURE WORK AND CONCLUSIONS -Cloud computing enables companies to outsource IaaS, PaaS, and SaaS on demand to reduce costs.
One of the areas cloud computing is increasingly being used for is large-scale data processing. Apache Hadoop is one such attempt to support data-intensive distributed applications. Hadoop applications use a primary distributed file system for data storage called the Hadoop Distributed File System (HDFS). The metadata of the HDFS Datanodes is restricted by the RAM capacity of HDFS's single-point-of-failure NameNode. This paper proposes a fault-tolerant, highly available, and widely scalable HDFS architecture whose NameNode is distributed, so that HDFS does not fail when a single NameNode fails. We achieve this by utilizing the Chord protocol and integrating it with the HDFS NameNode. Although this approach introduces a little complexity into the HDFS NameNode, our proposed architecture greatly improves the availability and scalability of the HDFS architecture and removes its single point of failure. As future work, we plan to conduct extensive experiments and to build a prototype based on this extensive evaluation process. -ACKNOWLEDGMENT -The author acknowledges the support provided by King Fahd University of Petroleum and Minerals (KFUPM). This project is funded by King Abdulaziz City for Science and Technology (KACST) under the National Science, Technology, and Innovation Plan (project number 11-INF1657-04). -REFERENCES -[1] M. Armbrust et al., "Above the Clouds: A Berkeley View of Cloud Computing," Technical Report EECS-2009-28, UC Berkeley, http://www.eecs.berkeley.edu/Pubs/TechRpts/2009/EECS-2009-28.html, Feb. 2009. -[2] Amazon Web Services Economics Center, http://aws.amazon.com/economics/. -[3] http://en.wikipedia.org/wiki/Hadoop -[4] http://hadoop.apache.org/#What+Is+Hadoop%3F -[5] http://hadoop.apache.org/hdfs/ -[6] http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html -[7] http://hadoop.apache.org/hdfs/docs/current/hdfs_design.html -[8] http://en.wikipedia.org/wiki/MapReduce -[9] I. Stoica, R. Morris, D. Liben-Nowell, D. R. Karger, M. F. Kaashoek, F. Dabek, and H. Balakrishnan, "Chord: a scalable peer-to-peer lookup protocol for internet applications," IEEE/ACM Transactions on Networking, Vol. 11, No. 1, pp. 17-32, Feb. 2003. -[10] D. R. Karger, E. Lehman, F. Leighton, M. Levine, D. Lewin, and R. Panigrahy, "Consistent hashing and random trees: Distributed caching protocols for relieving hot spots on the World Wide Web," in Proc. 29th Annu. ACM Symp. Theory of Computing, El Paso, TX, May 1997, pp. 654–663. -[11] R. Cox, A. Muthitacharoen, and R. Morris, "Serving DNS using Chord," in Proc. 1st Int. Workshop on Peer-to-Peer Systems, Cambridge, MA, Mar. 2002. -[12] F. Dabek, F. Kaashoek, D. R. Karger, R. Morris, and I. Stoica, "Wide-area cooperative storage with CFS," in Proc. ACM Symp. Operating Systems Principles, Banff, Canada, 2001, pp. 202–215. -[13] D. Liben-Nowell, H. Balakrishnan, and D. R. Karger, "Analysis of the evolution of peer-to-peer systems," in Proc. 21st ACM Symp. Principles of Distributed Computing (PODC), Monterey, CA, July 2002, pp. 233–242. -[14] D. R. Karger, E. Lehman, F. Leighton, M. Levine, D. Lewin, and R. Panigrahy, "Consistent hashing and random trees: Distributed caching protocols for relieving hot spots on the World Wide Web," in Proc. 29th Annu. ACM Symp. Theory of Computing, El Paso, TX, May 1997, pp. 654–663. -[15] D.
Lewin, ―Consistent hashing and random trees: Algorithms for caching in distributed networks,‖ Master’s thesis, Department of Electric. Eng. Comput. Sci., Massachusetts Inst. Technol., Cambridge, 1998. -[16] ―Secure Hash Standard,‖ U.S. Dept. Commerce/NIST, National Technical Information Service, Springfield, VA, FIPS 180-1, Apr. 1995 -[17] Zoltán Lajos Kis, Róbert Szabó, ―Interconnected Chord-rings‖, Network Protocols and Algorithms, Vol. 2, No. 2, 2010. -161 -Authorized licensed use limited to: BEIHANG UNIVERSITY. Downloaded on September 12,2021 at 03:53:07 UTC from IEEE Xplore. Restrictions apply. \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt.xml.xls deleted file mode 100644 index 72f6ee212878baefb8a042b43566089555c9e20b..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-relation.txt deleted file mode 100644 index 5329a9c3146391448a018b5f161bc023245fd2e1..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-relation.txt +++ /dev/null @@ -1,920 +0,0 @@ -mobile robot view project early detection&page&依赖 -Alzheimer&project& -mobile robot view project early detection&content&依赖 -Landmines detection&Mohammed Elmogy&依赖 -t�cnica vitivin�cola � july 2016 citation&READS&依赖 -a big data processing framework based&2,529 3 author&依赖 -mobile robot view project early detection&page&依赖 -Landmines detection&22 july 2016&依赖 -disease view project mohammed elmogy mansoura university 227 publication&1,801 citation&依赖 -a big data processing framework based&2,529 3 author&依赖 -mobile robot view project early detection&content&依赖 -author&publication&AGGREGATION -user&downloaded file&依赖 -user&enhancement&依赖 -enhancement&downloaded file&AGGREGATION -faculty and Egypt&Computers and Information&AGGREGATION -Faculty and Mansoura and Egypt&Computers and Information&AGGREGATION -mansoura university iot )&enormous storage challenge&依赖 -Internet&thing&AGGREGATION -Faculty&Computers and Information&AGGREGATION -IoT application&extensive development&依赖 -expansion&flow&依赖 -expansion&computational asset&AGGREGATION -flow&datum&AGGREGATION -expansion&datum&依赖 -expansion&significant effect&依赖 -vast flow&datum&AGGREGATION -vast flow&Big datum&依赖 -it&interesting information&依赖 -it&behavior and business intelligence&依赖 -user&behavior& -form&data resource&AGGREGATION -results and discussion&big datum iot-based smart application&依赖 -results and discussion&feasible solution&依赖 -we&clean noisy datum ( svd&依赖 -we&knn ) technique&依赖 -we&C-mean&依赖 -hybrid technique&C-mean&AGGREGATION -we&hybrid technique&依赖 -we&C-mean&依赖 -we&hybrid technique&依赖 -clustering technique&technique&GENERALIZATION -clustering technique&MapReduce model&实现 -MapReduce&most admit framework&依赖 -MapReduce&processing&依赖 -used technique&scalability&依赖 -it&huge dataset&依赖 -it&addition&依赖 -it&meaningful information&依赖 -accuracy&proposed 
framework&AGGREGATION -Internet iot )&thing&AGGREGATION -connection&variety&依赖 -connection&item&依赖 -connection&Internet&依赖 -connection&view information device&依赖 -variety&view information device&AGGREGATION -object&information&依赖 -object&information&依赖 -aim&perspective recognition&AGGREGATION -essential thought&thing&依赖 -essential thought&IoT&AGGREGATION -Figure 1&iot and big datum&依赖 -Figure 1&relationship&依赖 -sensor datum&big datum&依赖 -datum&IoT&依赖 -datum&most important part&依赖 -most important part&IoT&AGGREGATION -nature&IoT&AGGREGATION -billion&object&AGGREGATION -datum&IoT&依赖 -datum&sensor&依赖 -datum&nature&依赖 -various type&sensor&AGGREGATION -datum&various type&依赖 -It&discernment device&依赖 -datum&next challenges :&依赖 -datum&considered thing&依赖 -datum&next challenges :&依赖 -datum&considered thing&依赖 -huge number&discernment device&AGGREGATION -It&huge number&依赖 -massive scale&IoT&AGGREGATION -device&datum&依赖 -quick development&information scale&AGGREGATION -They&varied resources and heterogeneity&依赖 -They&IoT datum&依赖 -Different&observation gadget&AGGREGATION -varied resources and heterogeneity&IoT datum&AGGREGATION -gathered datum&different semantics and structure&依赖 -vast majority&IoT application&AGGREGATION -It&way&依赖 -IoT&communitarian&依赖 -IoT&data distribution&依赖 -2016 ) 4 interval&rescue vehicle&AGGREGATION -sort&assistant medical strategy&AGGREGATION -sort&what&依赖 -It&IoT&依赖 -It&principal issue&依赖 -application&IoT&AGGREGATION -It&application&依赖 -it&most part&依赖 -it&few sensor&依赖 -it&all while screen various pointer and dampness , light , and weight&依赖 -specimen information&line&依赖 -specimen information&information&GENERALIZATION -datum&volume&依赖 -they&mixed bag&依赖 -huge measure&dissimilar information&AGGREGATION -a thought or ideal model&gathering , and utilization&依赖 -Big datum&choice making&依赖 -Big datum&different universe&依赖 -Big datum&3 ]&依赖 -point&view&AGGREGATION -They&online networking&依赖 -They&sensor device&依赖 -big data 4v&volume , velocity , variety ,&依赖 -Figure 2&big data 4v&依赖 -It&big data sequence&依赖 -issues and technology&accessibility&依赖 -substantial volume&datum&AGGREGATION -accessibility&substantial volume&AGGREGATION -issues and technology&substantial volume&依赖 -organization&that&依赖 -issues and technology&datum&依赖 -rest&paper&AGGREGATION -Section 2&basic concept&依赖 -Section 3¤t related work&依赖 -Section 4&proposed system&依赖 -implementation result&proposed technique&AGGREGATION -implementation result&benchmark dataset&依赖 -implementation result&Section 5&依赖 -conclusion and future work&Section 6&依赖 -CONCEPTS&MapReduce&依赖 -one&perfect choice&AGGREGATION -perfect choice&programming paradigm&AGGREGATION -user&map function&依赖 -map function&pair&依赖 -map function&function&GENERALIZATION -map function&group&依赖 -pair&key-value&AGGREGATION -map function&intermediate key-value set&依赖 -group&intermediate key-value set&AGGREGATION -It&a reduce function&依赖 -MapReduce architecture&architecture&GENERALIZATION -MapReduce architecture&Figure 3&依赖 -MapReduce framework&framework&GENERALIZATION -reduce function&reduce function&实现 -MapReduce framework&large dataset&依赖 -mapper&mass&依赖 -mapper&datum&依赖 -mass&datum&AGGREGATION -reducer&intermediate result&依赖 -block diagram&MapReduce&AGGREGATION -block diagram&datum&依赖 -we&CSV extension&依赖 -we&data store&依赖 -we&dataset&依赖 -data store&datum&依赖 -data store&tabular datastore object&依赖 -we&' name&依赖 -we&dataset&依赖 -variables&names& -' name feature&working&依赖 -user&needs& -specified variable&need&AGGREGATION -' name feature&permit&依赖 
-user&preview command&依赖 -Figure&generic map function&依赖 -function&coder&依赖 -We&intermediate key and intermediate value&依赖 -we&dataset&依赖 -we&specific value&依赖 -we&set&依赖 -we&key-value&依赖 -set&key-value&AGGREGATION -large dataset initialize datastore variable&large dataset&依赖 -Add&select variable&依赖 -Add&datastore&依赖 -block diagram&generic map function&AGGREGATION -c. map function datum key-value store subset term generic map function&intermediate key-value store add set&依赖 -intermediate key-value store add set&intermediate key&AGGREGATION -c. map function datum key-value store subset term generic map function&intermediate key&依赖 -condition&key-value pair Create output store&AGGREGATION -Map function&function&GENERALIZATION -partition&key-value condition&依赖 -partition&datum&AGGREGATION -Data Intermediate key-value store Map function&key-value pair Create output store&依赖 -Data Intermediate key-value store Map function&condition&依赖 -block diagram&map function&AGGREGATION -Figure 6&Map function&依赖 -Map function&table&依赖 -Map function&variable&依赖 -variables&property& -subset&selected key&依赖 -subset&condition value&依赖 -subset&dataset&AGGREGATION -condition value&value&GENERALIZATION -map function extract&subset&依赖 -condition value&selected key&AGGREGATION -map function extract&dataset&依赖 -Reduce&one key&依赖 -block diagram&of&AGGREGATION -block diagram&reduce function&依赖 -load&different point&AGGREGATION -we&one&依赖 -separation&Epsilon&AGGREGATION -piece&" cluster&AGGREGATION -one&them&AGGREGATION -we&them&依赖 -they&Create&依赖 -we&cluster&依赖 -more than minpoint point&Intermediate value Key-value store&AGGREGATION -greater part&new point&AGGREGATION -they&more than minpoint point&依赖 -Get all intermediate result&intermediate value&依赖 -output value&value&GENERALIZATION -Get all intermediate result&output value&依赖 -2016 ) 9 separation&epsilon&AGGREGATION -part&other group&AGGREGATION -its&Epsilon& -it&other group&依赖 -minpoint point&Epsilon&依赖 -minpoint point&Epsilon&依赖 -noise&point&GENERALIZATION -it&" noise point "&依赖 -whose&items& -each core-point c&edge&依赖 -each core-point c&edge&依赖 -each core-point c&c&依赖 -each core-point c&c&依赖 -each core-point c&edge&依赖 -each core-point c&edge&依赖 -each core-point c&c&依赖 -- neighborhood&c 3&AGGREGATION -each core-point c&edge&依赖 -each core-point c&c&依赖 -each core-point c&c&依赖 -item&graph&AGGREGATION -core point&point&GENERALIZATION -let x&1&依赖 -set&node&AGGREGATION -dataset&n cluster&依赖 -data point&cluster&依赖 -high level&relationship&AGGREGATION -cluster&relationship&依赖 -cluster&cluster&依赖 -cluster&high level&依赖 -data point&point&GENERALIZATION -data point&cluster&依赖 -data point&association&依赖 -data point&cluster&依赖 -low level&association&AGGREGATION -data point&low level&依赖 -center&cluster&AGGREGATION -technique&pattern recognition&依赖 -It&minimization&依赖 -It&objective function&依赖 -minimization&objective function&AGGREGATION -membership&xus&AGGREGATION -cj&following equation [ 6 ]&依赖 -cj¢er&依赖 -cj&cj = σn i = 1 ( mm ij&依赖 -| | * | |&measured datum&依赖 -ith&d-dimensional measured datum&AGGREGATION -degree&membership&AGGREGATION -cj&cluster&依赖 -d-dimension center&cluster&AGGREGATION -FCM sequentially&dataset&依赖 -FCM sequentially&right area&依赖 -FCM sequentially&cluster center&依赖 -FCM clustering strategy&fuzzy behavior&依赖 -they&method&依赖 -fuzzy behavior&issn :0254 -0223 vol&依赖 -7&clustering&依赖 -membership weight&a characteristic translation but not probabilistic&依赖 -membership weight&all&依赖 -outcome&item&依赖 -outcome&2.4 K&依赖 -estimation&item&AGGREGATION -outcome&KNN 
regression&依赖 -its&neighbors& -value&k-closest neighbor&依赖 -value&estimation&依赖 -estimation&k-closest neighbor&AGGREGATION -Euclidean distance&distance&GENERALIZATION -KNN&Euclidean distance&依赖 -KNN&labeled example&依赖 -KNN&Euclidean distance&依赖 -KNN&following equation [ 8 ]&依赖 -it&overall noise&依赖 -top K-number&adjacent neighbor&AGGREGATION -labeled example&highest distance&依赖 -It&detail&依赖 -n row&datum&依赖 -2.5 SINGULAR VALUE DECOMPOSITION SVD&datum&依赖 -rectangular matrix&datum&AGGREGATION -p column&experimental property&依赖 -2.5 SINGULAR VALUE DECOMPOSITION SVD&rectangular matrix&依赖 -same dimension&singular value&依赖 -VT&row&依赖 -SVD&outline&依赖 -SVD&coordinate system&依赖 -coordinate system&system&GENERALIZATION -SVD&original datum&依赖 -outline&original datum&AGGREGATION -eigenvector&a relate to&AGGREGATION -SVD&equation&依赖 -x&a relate to&依赖 -eigenvalue&A&AGGREGATION -eigenvalues and eigenvector&AAT or ATA&AGGREGATION -computation&SVD&AGGREGATION -singular value&AAT or ATA&依赖 -column&V&AGGREGATION -column&U.&AGGREGATION -eigenvector&column&依赖 -singular value&AAT or ATA&依赖 -eigenvector&V&依赖 -singular value&AAT or ATA&依赖 -eigenvector&U.&依赖 -singular value&eigenvalue&依赖 -square root&eigenvalue&AGGREGATION -singular value&AAT or ATA&依赖 -singular value&eigenvalue&依赖 -eigenvector&ATA&AGGREGATION -eigenvector&column&依赖 -eigenvector&AAT&AGGREGATION -diagonal entry&S matrix&AGGREGATION -S matrix&matrix&GENERALIZATION -singular value&S matrix&依赖 -SVD feature&matrix&依赖 -SVD feature&nearest rank-l estimation&依赖 -number&outstanding singular value&AGGREGATION -we&matrix estimation&依赖 -whose&rank& -whose rank&outstanding singular value&依赖 -whose rank&number&依赖 -7&important research topic&依赖 -Many researcher&field&依赖 -MapReduce technique&technique&GENERALIZATION -Tao&MapReduce technique&依赖 -light&K-means clustering calculation&AGGREGATION -They&light&依赖 -They&K-means clustering calculation&依赖 -They&monstrous little datum&依赖 -They&procedure&依赖 -outcome&information prepare proficiency&依赖 -Their&outcomes& -They&Kmeans calculation&依赖 -They&Kmeans calculation&依赖 -They&MapReduce&依赖 -They&view&依赖 -view&MapReduce&AGGREGATION -they&datum&依赖 -they&record&依赖 -they&converging&依赖 -converging&datum&AGGREGATION -they&cluster&依赖 -datum&high likeness&依赖 -datum&high likeness&依赖 -merger technique&technique&GENERALIZATION -merger technique&little information&AGGREGATION -exploration&little information&依赖 -exploration&them&依赖 -exploration&IoT&依赖 -exploration&merger technique&依赖 -number&cluster&AGGREGATION -Xu and Xun [ 11 ]&distributed computing&依赖 -MapReduce model&distributed computing&AGGREGATION -Xu and Xun [ 11 ]&MapReduce model&依赖 -they&MapReduce&依赖 -instrument&MapReduce&AGGREGATION -they&instrument&依赖 -key innovation&IoT&AGGREGATION -they&structural planning attribute&依赖 -They&IoT world&依赖 -IoT world&world&GENERALIZATION -They&information and datum&依赖 -They&conveyed mining&依赖 -they&stream information distribution&依赖 -deficiency&conventional Apriori calculation&AGGREGATION -Apriori&lower mining proficiency&依赖 -mining technique&stream information investigation , group and so on&依赖 -mining technique&technique&GENERALIZATION -mining technique&stream information investigation , group and so on&依赖 -mining technique&stream information investigation , group and so on&依赖 -They&system&依赖 -security&information&AGGREGATION -proposed system&low effectiveness&依赖 -its&usage& -Wang et al. [ 12 ]&structural planning&依赖 -Wang et al. [ 12 ]&agribusiness&依赖 -structural planning&IoT&AGGREGATION -Wang et al. 
[ 12 ]&IoT&依赖 -IoT&distributed processing&依赖 -structural planning&enormous sensor information&依赖 -sensor information&information&GENERALIZATION -structural planning&constant read or access&依赖 -XML document&standard&依赖 -organization&heterogeneous sensor datum&AGGREGATION -XML document&heterogeneous sensor datum&依赖 -XML document&standard&依赖 -XML document&organization&依赖 -lack&variety&AGGREGATION -variety&sensor datum&AGGREGATION -ClustBigFIM method&method&GENERALIZATION -Gole and Tidk [ 13 ]&ClustBigFIM method&依赖 -improvement&BigFIM algorithm&AGGREGATION -ClustBigFIM&BigFIM algorithm&依赖 -improvement&information&依赖 -improvement&velocity&依赖 -other data mining mission&good vision&依赖 -They&manner&依赖 -manner&association&AGGREGATION -They&association&依赖 -It&frequent item&依赖 -flow&information&AGGREGATION -It&Big datum&依赖 -Li et al. [ 1 ]&storage managing clarification&依赖 -storage managing clarification&managing clarification&GENERALIZATION -They&managing clarification&依赖 -IOTMDB&save&依赖 -Their&work& -they&addition&依赖 -they&massive IoT data ISSN :0254 -0223 Vol&依赖 -its&value& -diverse structure&sensor&依赖 -Mesiti and Valtolina [ 14 ]&structure&依赖 -diverse structure&sensor&依赖 -information accumulation&database&依赖 -they&answer&依赖 -world&Big information investigation strategy&依赖 -answer&information&依赖 -answer&heterogeneous sensor&依赖 -NoSQL framework&framework&GENERALIZATION -NoSQL framework&reasonable mapping&依赖 -They&easy to use loading framework&依赖 -zhan et al. [ 15 ]&massive data processing model&依赖 -zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 -zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 -zhan et al. [ 15 ]&massive data processing model&依赖 -zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 -zhan et al. [ 15 ]&massive data processing model&依赖 -zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 -zhan et al. [ 15 ]&massive data processing model&依赖 -zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 -zhan et al. [ 15 ]&massive data processing model&依赖 -zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 -zhan et al. [ 15 ]&massive data processing model&依赖 -type&data resource&AGGREGATION -Their&model& -They&two main point&依赖 -they&cloudf&依赖 -they&Cloud Manager DB&实现 -variety&datum&AGGREGATION -galache et al. [ 16 ]&galache et al. [ 16 ]&依赖 -galache et al. [ 16 ]&galache et al. [ 16 ]&依赖 -nation&city resource&依赖 -Their&issue& -they&care&依赖 -they&resource&依赖 -they&resource&依赖 -they&resource&依赖 -they&resource&依赖 -they&care&依赖 -care&resource&AGGREGATION -they&care&依赖 -they&care&依赖 -set&smart IoT service&AGGREGATION -proposed framework&three-layer architecture&依赖 -asset&effective IoT benefit&依赖 -asset&Cloud&依赖 -Sowe et al.&answer&依赖 -Sowe et al.&massive heterogeneous sensor information issue&依赖 -They&join&依赖 -distinctive type&information&AGGREGATION -It&key middleware&依赖 -It&a service-controlled networking ( scn )&依赖 -It&sensor information&依赖 -It&client&依赖 -They&harvester ( udh ) advancement&依赖 -They&SCN&依赖 -portable detecting information&paper&依赖 -They&structure&依赖 -Cecchinel et al. 
[ 18 ]&programming structure&依赖 -programming structure&structure&GENERALIZATION -structure&dataset&依赖 -utilization measure&dataset&AGGREGATION -dataset&SMARTCAMPUS venture&依赖 -structural engineering&SMARTCAMPUS venture&依赖 -Their&engineering& -structural engineering&genuine prerequisite&依赖 -work&Big data accumulation stage&依赖 -work&way&依赖 -work&Big data accumulation stage&依赖 -way&Big data accumulation stage&AGGREGATION -work&way&依赖 -They&ISSN :0254 -0223 Vol&依赖 -its&applications& -programming model&client&依赖 -system programming&off chance&依赖 -they&new information&依赖 -Mishra et al. [ 19 ]&valuable data administration and knowledge detection&依赖 -Mishra et al. [ 19 ]&IoT Big datum&依赖 -Mishra et al. [ 19 ]&a cognitive-oriented iot big-da framework ( coib )&依赖 -They&huge scale mechanical computerization environment&依赖 -They&general IoT Big data layered design&依赖 -They&COIB system&依赖 -usage&COIB system&AGGREGATION -They&general IoT Big data layered design&依赖 -COIB system&system&GENERALIZATION -They&mining and examination huge information&依赖 -proposed system&store and retrieve iot big datum&依赖 -trillion&IoT item&AGGREGATION -proposed system&solution&依赖 -their&work& -our&system& -accuracy&datum&AGGREGATION -proposed system&datum&依赖 -proposed system&datum&依赖 -proposed system&massive number&依赖 -massive number&datum&AGGREGATION -proposed system&massive number&依赖 -we&datum&依赖 -we&noise&依赖 -we&big datum&依赖 -we&kennard sample and svd&依赖 -data reduction technique&big datum&依赖 -data reduction technique&IoT&依赖 -we&IoT&依赖 -we&mutual information algorithm&依赖 -we&datum clustering&依赖 -we&vast store&依赖 -we&MapReduce&依赖 -proposed system figure 8&proposed system figure 8&依赖 -proposed system&massive – heterogeneous sensor datum&AGGREGATION -proposed system&massive – heterogeneous sensor datum&依赖 -noiseless datum issn :0254 -0223 vol&noiseless datum issn :0254 -0223 vol&依赖 -Variety sensor raw datum datum cleaning data integration datum processing ( clustering&sensor raw datum datum cleaning data integration datum processing ( clustering&AGGREGATION -Variety sensor raw datum datum cleaning data integration datum processing ( clustering&little size clean&依赖 -Variety sensor raw datum datum cleaning data integration datum processing ( clustering&datum datum&依赖 -Variety sensor raw datum datum cleaning data integration datum processing ( clustering&datum datum&依赖 -Variety sensor raw datum datum cleaning data integration datum processing ( clustering&little size clean&依赖 -proposed system&data preprocessing and data processing phase&依赖 -proposed system&two main phase&依赖 -dataset&stage&依赖 -dataset&different sensor&依赖 -utilization&kennard sampling&AGGREGATION -dimensionality&datum&AGGREGATION -execution time&datum processing&AGGREGATION -last stage&correlation and mutual information&依赖 -last stage&aiming&依赖 -it&data distribution&依赖 -performance&big datum&AGGREGATION -main stage&detail&依赖 -main stage&subsection&依赖 -main stage&two phase&AGGREGATION -preprocessing&data science&依赖 -it&choice&依赖 -data mining method&raw datum&依赖 -data mining method&reasonable information&依赖 -It&association&依赖 -It&database-driven application&依赖 -step&detail&依赖 -step&subsection&依赖 -confusion&real information&依赖 -more than 30 %&real information&AGGREGATION -confusion&more than 30 %&依赖 -it&addition&依赖 -it&cost [ 21 ]&依赖 -It&datum&依赖 -rest&datum&AGGREGATION -it&size&依赖 -nominal attribute&datum&AGGREGATION -It&datum&依赖 -help&many technique&AGGREGATION -mean&numeric attribute or mode&AGGREGATION -It&KNN algorithm&依赖 -It&discrete and continuous 
attribute&依赖 -knn search&datum&依赖 -It&dataset&依赖 -It&most probable value&依赖 -We&data cleaning&依赖 -We&KNN algorithm&依赖 -block diagram&datum cleaning step&AGGREGATION -block diagram&datum cleaning step&依赖 -figure 9 show&noisy data and outlier&依赖 -figure 9 show&many challenge&依赖 -repetition&datum&AGGREGATION -value&KNN regression&依赖 -value&most probable value&依赖 -b ) data reduction a monstrous measure&different source&依赖 -b ) data reduction a monstrous measure&different source&依赖 -logistics insight&example&依赖 -logistics insight&r&d [ 23 ]&依赖 -logistics insight&r&d [ 23 ]&依赖 -b ) data reduction a monstrous measure&information&AGGREGATION -extraordinary difficulty term&computational manysided quality and characterization execution&AGGREGATION -Highdimensional information&computational manysided quality and characterization execution&依赖 -Highdimensional information&extraordinary difficulty term&依赖 -it&low-dimensional component space&依赖 -block diagram&data reduction step&依赖 -block diagram&data reduction step&依赖 -list&highest smallest distance&AGGREGATION -Kennard sample&time&依赖 -Kennard sample&number&依赖 -Kennard sample&iteration&依赖 -number&iteration&AGGREGATION -We&SVD&依赖 -dimensionality&large dimensional datum&AGGREGATION -SVD Input data De-duplication Detect outlier Replace&value&依赖 -SVD Input data De-duplication Detect outlier Replace&Input datum&依赖 -purpose&access&AGGREGATION -Big datum&huge volume&依赖 -Big datum&organization&依赖 -number&different source&AGGREGATION -It&diverse structure&依赖 -It&–&依赖 -this immense , various sort&information&AGGREGATION -organization&speedy , exact , and significant bit&依赖 -organization&knowledge [ 26 ]&依赖 -speedy , exact , and significant bit&knowledge [ 26 ]&AGGREGATION -Mutual information&relationship&依赖 -Mutual information&attribute&依赖 -( y ) ] ( 10 )&( x&依赖 -( y ) ] ( 10 )&[ 27 ]&依赖 -equation&mutual information&AGGREGATION -( y ) ] ( 10 )&y ) log2 [ p ( x&依赖 -two dimension&dataset&AGGREGATION -X and Y&dataset&依赖 -control&information processing&AGGREGATION -information processing&processing&GENERALIZATION -Data Processing Phase Data processing phase&information processing&依赖 -handling&information&AGGREGATION -Information preparation&handling&依赖 -Information preparation&information&依赖 -Massive datum&processing&依赖 -Massive datum&data store&依赖 -tremendous measure&comparative quality&AGGREGATION -We&MapReduce&依赖 -hybrid&FCM and DBSCAN&AGGREGATION -We&MapReduce&依赖 -We&MapReduce&依赖 -minimum point&minimum value&依赖 -minimum value&point&AGGREGATION -FCM-DBSCAN Map function&Map function&GENERALIZATION -minimum point&point&依赖 -we&FCM-DBSCAN Map function&依赖 -epsilon value¢er and point&依赖 -epsilon value&distance&依赖 -minimum point&cluster&依赖 -we&minimum point&依赖 -we&equation&依赖 -points and center&cluster&AGGREGATION -we¢er&依赖 -we&cluster&依赖 -epsilon value&value&GENERALIZATION -point and center&cluster equal&AGGREGATION -distance&greater&依赖 -distance&greater&依赖 -distance&epsilon value&依赖 -point&cluster&依赖 -point&neighborpt&依赖 -distance&epsilon value&依赖 -distance&greater&依赖 -distance&epsilon value&依赖 -point&cluster&依赖 -We&key&依赖 -It&reach&依赖 -It&convergence state&依赖 -7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&number&依赖 -7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&cluster&依赖 -7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&cluster&依赖 -7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&number&依赖 -7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&number&依赖 -7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&cluster&依赖 -each point 
p&dataset D&依赖 -each point p&each point p&依赖 -each point p&dataset D&依赖 -each point p&each point p&依赖 -each point p&each point p&依赖 -each point p&dataset D&依赖 -input&FCM-DBSCAN Reduce function&依赖 -final cluster point&C cluster&依赖 -final cluster point&previous cluster point&依赖 -final cluster point&addition&依赖 -C cluster&cluster&GENERALIZATION -final cluster point¤t cluster point&依赖 -a point issn&:0254 -0223 Vol&依赖 -all point&cluster&依赖 -all point&cluster&依赖 -all point&cluster&依赖 -neighbor point&minimum point&依赖 -minimum point&point&GENERALIZATION -neighbor point&neighbor point&依赖 -output&datum&依赖 -output&cluster&依赖 -cluster&datum&AGGREGATION -set&cluster&AGGREGATION -raw datum&different sensor&依赖 -problem&sensor datum&依赖 -our propose work aim&problem&依赖 -Our&work& -raw datum&different sensors and store&依赖 -we&datum&依赖 -datum&noise&依赖 -datum&KNN&依赖 -We&KNN&依赖 -cleared datum&SVD algorithm&依赖 -significant vision&datum&AGGREGATION -it&time&依赖 -proposed model&data processing step&依赖 -clustering algorithm&entity&依赖 -clustering algorithm&space&依赖 -arrangement&entity&AGGREGATION -clustering algorithm&arrangement&依赖 -It&diverse forms and size&依赖 -It&cluster&依赖 -It&diverse forms and size&依赖 -It&cluster&依赖 -It&diverse forms and size&依赖 -huge quantity&datum&AGGREGATION -It&diverse forms and size&依赖 -cluster&diverse forms and size&AGGREGATION -It&cluster&依赖 -It&cluster&依赖 -RESULTS&29 ]&依赖 -RESULTS&ordinary IADL housekeeping activity&依赖 -general interval&dataset&AGGREGATION -usual spreading&activity&AGGREGATION -interval&activity&依赖 -interval&usual spreading&依赖 -interval&daily life&依赖 -Porcupine sensor&sensor&GENERALIZATION -They&ibracelet&依赖 -They&acceleration and RFID tag detection&依赖 -They&Porcupine sensor&依赖 -dataset&estimation&依赖 -estimation&1048576 record&AGGREGATION -dataset&1048576 record&依赖 -We&proposed technique and core ( tm )&实现 -We&2 due , 2 gh processor&实现 -part&used dataset&AGGREGATION -Figure 11&part&依赖 -Figure 11&used dataset&依赖 -act&activity label result&依赖 -act&iron&依赖 -Acc&acc3&依赖 -beginning&recording&AGGREGATION -Time&elapsed number&依赖 -Time&second&依赖 -elapsed number&second&AGGREGATION -Acc&real time clock [ ddmmyyhhmmss ]&依赖 -Time&elapsed number&依赖 -Time&second&依赖 -Time&second&依赖 -Time&elapsed number&依赖 -Figure 12&outlier detection&依赖 -value&state&依赖 -value&outlier&依赖 -value&state&依赖 -value&outlier&依赖 -value&field&AGGREGATION -observation&value&依赖 -expected scope&value&AGGREGATION -observation&experiment&依赖 -outlier&measurement or experimental error indication&依赖 -outlier&dataset&依赖 -outlier&value&依赖 -figure 13 show&value&依赖 -reduction&dataset&AGGREGATION -datum&property&依赖 -smaller number&property&AGGREGATION -datum&smaller number&依赖 -attribute&priority&依赖 -attribute&priority&依赖 -SVD1&present the datum&依赖 -SVD1&highest probability&依赖 -outcome matrix&matrix&GENERALIZATION -Figure 15&outcome matrix&依赖 -Figure 15&mutual information&依赖 -measure&two variable&依赖 -measure&two variable&依赖 -measure&variables mutual dependence&AGGREGATION -trans-information&two variable&AGGREGATION -mutual information&association or correlation&依赖 -rate&association or correlation&AGGREGATION -mutual information&row and column variable&依赖 -mutual information&rate&依赖 -mutual information&2N&依赖 -mutual information&datum&依赖 -it&high relationship&依赖 -value&mutual information&AGGREGATION -it&attribute&依赖 -MapReduce function execution&MapReduce implementation&依赖 -MapReduce function execution&result datum&依赖 -Figure 16&read datum&依赖 -read datum&resulted attributes view&依赖 -read datum&resulted attributes view&依赖 -read 
datum&set&依赖 -Figure 16&MapReduce&依赖 -Figure 16&MapReduce&依赖 -read datum&set&依赖 -Figure 16&read datum&依赖 -set&resulted attributes view&AGGREGATION -Figure 17&MapReduce implementation&依赖 -dataset&Map&依赖 -we&MapReduce implementation&依赖 -we&data result&依赖 -preprocessing&dataset&AGGREGATION -time and accuracy&dataset&AGGREGATION -5.4 EVALUATION The evaluation&dataset&依赖 -5.4 EVALUATION The evaluation&time and accuracy&依赖 -time and accuracy&preprocessing&AGGREGATION -value&specificity&AGGREGATION -we&Big datum&依赖 -we&accuracy&依赖 -accuracy&Big datum&AGGREGATION -negative tuple&FN False negative&依赖 -positive tuple&ISSN :0254 -0223 Vol&依赖 -negative tuple&TN True negative&依赖 -positive tuple&FP False Positives&依赖 -our&FCM-DBSCAN& -clustering algorithm&PCA&依赖 -clustering algorithm&different data reduction algorithm&依赖 -we&table 2&依赖 -we&dataset&依赖 -we&proposed approach&依赖 -we&training data and testing datum&依赖 -we&tested datum&依赖 -performance measure&proposed system&AGGREGATION -expended time comparison&different reduction algorithm&依赖 -expended time comparison&different reduction algorithm&依赖 -we&high accuracy value&依赖 -its&approaches& -our&studies& -FCM-DBSCAN&accuracy&依赖 -FCM-DBSCAN&highest value&依赖 -FCM-DBSCAN&accuracy&依赖 -FCM-DBSCAN&highest value&依赖 -highest value&accuracy&AGGREGATION -K-Means and optics&nearest accuracy value&依赖 -optics&longer time&依赖 -EM algorithm&other technique&依赖 -EM algorithm&larger time&依赖 -DBSCAN&high accuracy&依赖 -accuracy&FCM-DBSCAN&依赖 -vast increase&device&AGGREGATION -massive amount&IoT datum&AGGREGATION -Big datum&massive datum&依赖 -massive datum&much time&依赖 -We&processing massive and heterogeneous datum&依赖 -We&IoT&依赖 -We&framework&依赖 -paper&Big datum&依赖 -paper&many viewpoint&依赖 -raw dataset&different sensor&依赖 -Our&system& -proposed system&problem&依赖 -architecture&optics em dbscan fcm-dbscan pca pca kernel ica som svd issn :0254 -0223 vol&依赖 -architecture&proposed system&AGGREGATION -architecture&optics em dbscan fcm-dbscan pca pca kernel ica som svd issn :0254 -0223 vol&依赖 -we&preprocessing phase&依赖 -datum&most probable value&依赖 -we&KNN&依赖 -MapReduce model&datum clustering&依赖 -MapReduce model&datum clustering&依赖 -MapReduce model&Map and Reduce function&依赖 -MapReduce model&Map and Reduce function&依赖 -MapReduce model&datum clustering&依赖 -MapReduce model&datum clustering&依赖 -MapReduce model&Map and Reduce function&依赖 -MapReduce model&Map and Reduce function&依赖 -processing time&proposed system&AGGREGATION -we&processing&实现 -we&processing&实现 -we&different dataset&实现 -we&different dataset&实现 -future work&time&依赖 -we&data query processing&实现 -best and suitable model&NoSQL database&AGGREGATION -NoSQL database&database&GENERALIZATION -we&NoSQL database&实现 -we&best and suitable model&实现 -We&Key-value database&依赖 -Key-value database&database&GENERALIZATION -key-value ( kv ) store&associative array&依赖 -approach&selective key range&依赖 -we&challenge&依赖 -[&1 ] li , t. , liu , y. , tian , y. , shen , s. , & mao and w. ( 2012 ) w. ( 2012 )&依赖 -Improvement&Analyze Cluster&依赖 -Improvement&Large dataset&依赖 -Improvement&dbscan algorithm&AGGREGATION -Improvement&Large dataset&依赖 -Improvement&Large dataset&依赖 -Improvement&Large dataset&依赖 -Improvement&Analyze Cluster&依赖 -Improvement&Analyze Cluster&依赖 -Improvement&Analyze Cluster&依赖 -Comparative Analysis&k-mean&AGGREGATION -http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&c. ( 2014 )&依赖 -http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. 
( 2014 )&7 jan 2016&依赖 -http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&c. ( 2014 )&依赖 -http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&依赖 -http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&7 jan 2016&依赖 -http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&依赖 -Management&Big Data&AGGREGATION -analysis&big datum&AGGREGATION -38th ieee annual international computers and Software&38th ieee annual international computers and Software&依赖 -management&massive IoT datum&AGGREGATION -collection&big datum&AGGREGATION -cognitive-oriented framework&iot big-data management prospective&依赖 -cognitive-oriented framework&iot big-data management prospective&依赖 -International Journal and ijact ) , 7 ( 5 ) and ijact ) , 7 ( 5 )&Advancements&AGGREGATION -new approachµarray data dimension reduction&AGGREGATION -data-pain-points [ 27 ] cover , t. & thomas and j. ( 2012 )&8 july 2015&依赖 -data-pain-points [ 27 ] cover , t. & thomas and j. ( 2012 )&http://data-informed.com/how-to-address-commonbig&依赖 -data-pain-points [ 27 ] cover , t. & thomas and j. ( 2012 )&8 july 2015&依赖 -data-pain-points [ 27 ] cover , t. & thomas and j. ( 2012 )&http://data-informed.com/how-to-address-commonbig&依赖 -element&information theory&AGGREGATION -Combination&RFID&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-simEnts.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-simEnts.txt deleted file mode 100644 index 80c99c07f0aa3295882dd0ec4cedf029dc5953fc..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-simEnts.txt +++ /dev/null @@ -1,74 +0,0 @@ -Input,Input -Input,class TextInputFormat -Input,class SequenceFileInputFormat -Input,class CombineFileInputFormat -Input,class KeyValueTextInputFormat -Input,class FixedLengthInputFormat -Input,class NLineInputFormat -Input,class CombineFileRecordReader -Input,class KeyValueLineRecordReader -Input,class SequenceFileRecordReader -Input,class DBRecordReader -TextInputFormat,TextInputFormat -SequenceFileInputFormat,SequenceFileInputFormat -CombineFileInputFormat,CombineFileInputFormat -KeyValueTextInputFormat,KeyValueTextInputFormat -KeyValueTextInputFormat,KeyFilter -KeyValueTextInputFormat,Key questions -FixedLengthInputFormat,FixedLengthInputFormat -NLineInputFormat,NLineInputFormat -CombineFileRecordReader,CombineFileRecordReader -KeyValueLineRecordReader,KeyValueLineRecordReader -KeyValueLineRecordReader,Key questions -SequenceFileRecordReader,SequenceFileRecordReader -DBRecordReader,DBRecordReader -Map,Map -InverseMapper,InverseMapper -MultithreadedMapper,MultithreadedMapper -RegexMapper,RegexMapper -TokenCounterMapper,TokenCounterMapper -Partition,Partition -Partition,class KeyFieldBasedPartitioner -BinaryPartitioner,BinaryPartitioner -HashPartitioner,HashPartitioner -HashPartitioner,default Partitioner 
-KeyFieldBasedPartitioner,KeyFieldBasedPartitioner -KeyFieldBasedPartitioner,Key idea -RehashPartitioner,RehashPartitioner -TotalOrderPartitioner,TotalOrderPartitioner -Reduce,Reduce -Reduce,class IntSumReducer -Reduce,class LongSumReducer -Reduce,class FailJob -IntSumReducer,IntSumReducer -IntSumReducer,Reducer interfaces -IntSumReducer,ReducerFactory -IntSumReducer,Reducer aggregate -IntSumReducer,ReducerPhase -IntSumReducer,Reducer implementations -LongSumReducer,LongSumReducer -LongSumReducer,Reducer interfaces -LongSumReducer,ReducerFactory -LongSumReducer,Reducer aggregate -LongSumReducer,ReducerPhase -LongSumReducer,Reducer implementations -Output,Output -Output,class FileOutFormat -Output,class MapFileOutputFormat -Output,class SequenceFileOutputFormat -Output,class TextOutputFormat -Output,class MultipleOutputs -Output,class FileOutputCommitter -Output,class RecordWriter -MapFileOutputFormat,MapFileOutputFormat -MapFileOutputFormat,method Map -MapFileOutputFormat,Map Reduce papers -MapFileOutputFormat,MapTask -MapFileOutputFormat,FacebookMap -Map,class InverseMapper -Map,class MultithreadedMapper -Map,class RegexMapper -Map,class TokenCounterMapper -Map,class WrappedMapper -SequenceFileOutputFormat,SequenceFileOutputFormat -TextOutputFormat,TextOutputFormat diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt deleted file mode 100644 index 88c9d8c94333d163ebd0526cefd8092f6fc7e556..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt +++ /dev/null @@ -1,74 +0,0 @@ -Input , Input -Input , class Text Input Format -Input , class Sequence File Input Format -Input , class Combine File Input Format -Input , class Key Value Text Input Format -Input , class Fixed Length Input Format -Input , class N Line Input Format -Input , class Combine File Record Reader -Input , class Key Value Line Record Reader -Input , class Sequence File Record Reader -Input , class D B Record Reader -Text Input Format , Text Input Format -Sequence File Input Format , Sequence File Input Format -Combine File Input Format , Combine File Input Format -Key Value Text Input Format , Key Value Text Input Format -Key Value Text Input Format , Key Filter -Key Value Text Input Format , Key questions -Fixed Length Input Format , Fixed Length Input Format -NLine Input Format , NLine Input Format -Combine File Record Reader , Combine File Record Reader -Key Value Line Record Reader , Key Value Line Record Reader -Key Value Line Record Reader , Key questions -Sequence File Record Reader , Sequence File Record Reader -DBRecord Reader , DBRecord Reader -Map , Map -Inverse Mapper , Inverse Mapper -Multithreaded Mapper , Multithreaded Mapper -Regex Mapper , Regex Mapper -Token Counter Mapper , Token Counter Mapper -Partition , Partition -Partition , class Key Field Based Partitioner -Binary Partitioner , Binary Partitioner -Hash Partitioner , Hash Partitioner -Hash Partitioner , default Partitioner -Key Field Based Partitioner , Key Field Based Partitioner -Key Field Based Partitioner , Key idea -Rehash Partitioner , Rehash Partitioner -Total Order Partitioner , Total Order Partitioner -Reduce , Reduce -Reduce , class Int Sum Reducer 
-Reduce , class Long Sum Reducer -Reduce , class Fail Job -Int Sum Reducer , Int Sum Reducer -Int Sum Reducer , Reducer interfaces -Int Sum Reducer , Reducer Factory -Int Sum Reducer , Reducer aggregate -Int Sum Reducer , Reducer Phase -Int Sum Reducer , Reducer implementations -Long Sum Reducer , Long Sum Reducer -Long Sum Reducer , Reducer interfaces -Long Sum Reducer , Reducer Factory -Long Sum Reducer , Reducer aggregate -Long Sum Reducer , Reducer Phase -Long Sum Reducer , Reducer implementations -Output , Output -Output , class File Out Format -Output , class Map File Output Format -Output , class Sequence File Output Format -Output , class Text Output Format -Output , class Multiple Outputs -Output , class File Output Committer -Output , class Record Writer -Map File Output Format , Map File Output Format -Map File Output Format , method Map -Map File Output Format , Map Reduce papers -Map File Output Format , Map Task -Map File Output Format , Facebook Map -Map , class Inverse Mapper -Map , class Multithreaded Mapper -Map , class Regex Mapper -Map , class Token Counter Mapper -Map , class Wrapped Mapper -Sequence File Output Format , Sequence File Output Format -Text Output Format , Text Output Format \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt deleted file mode 100644 index e9490149a471f323efc9f15d4325c58cb4176a0d..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt +++ /dev/null @@ -1,898 +0,0 @@ -See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/305489358 -A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH -APPLICATION TO INTERNET OF THINGS -Article in Ciência e Técnica Vitivinícola · July 2016 -CITATIONS -0 -READS -2,529 -3 authors, including: -Some of the authors of this publication are also working on these related projects: -Landmines detection using mobile robots View project -Early Detection for Alzheimer's Disease View project -Mohammed Elmogy -Mansoura University -227 PUBLICATIONS 1,801 CITATIONS -SEE PROFILE -All content following this page was uploaded by Mohammed Elmogy on 22 July 2016. -The user has requested enhancement of the downloaded file. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -2 -A BIG DATA PROCESSING FRAMEWORK -BASED ON MAPREDUCE WITH APPLICATION -TO INTERNET OF THINGS -1Heba, A., 2 Mohammed, E., 3 Shereif, B. -1 Information Systems Dept., Faculty of Computers and Information, -Mansoura University Mansoura, Egypt, hebaaly92@gmail.com -*2 Information Technology Dept., Faculty of Computers and Information, -Mansoura University, Mansoura, Egypt, melmogy@mans.edu.eg -3 Information Systems Dept., Faculty of Computers and Information, -Mansoura University, Mansoura, Egypt, sherifiib@yahoo.com -ABSTRACT -Massive and various data from the Internet of Things (IoT) generate enormous storage challenges. The -IoT applications caused an extensive development. In the past two decades, the expansion of -computational asset had a significant effect on the flow of the data. 
The vast flow of data is identified as -"Big data," which is the data that cannot be managed using current ordinary techniques or tools. If it is -correctly handled, it generates interesting information, such as investigating the user's behavior and -business intelligence. -In this paper, the proposed system is implemented to handle massive data with all forms of data -resources whether structured, semi-structured, and non-structured altogether. The results and discussion -show that the proposed system generates a feasible solution in applying big data IoT-based smart -applications. In the data preprocessing stage, we used the K-nearest neighbors (KNN) technique to clean -noisy data and a Singular Value Decomposition (SVD) to reduce data dimensionality. In the processing -stage, we proposed a hybrid technique of a Fuzzy C-mean and Density-based spatial clustering (FCMDBSCAN) -to deal with the applications with noise. The clustering technique is implemented on -MapReduce model. MapReduce is represented as the most admitted framework to operate processing on -big data. The MapReduce is the most principle model to deal with big data. The used technique is -providing scalability, rapidity, and well-fitting accuracy for storing big data. In addition, it is obtaining -meaningful information from huge datasets that give great vision to make effective outcomes using fast -and efficient processing platform. Experimental results show that the accuracy of the proposed -framework is 98.9% using IADL activities dataset. -KEYWORDS: Internet of Things (IoT); Big data; Singular Value Decomposition (SVD); FCMDBSCAN; -MapReduce. -1. INTRODUCTION -The IoT is the connection that joins items to the Internet over varieties of view -information devices. Therefore, all objects that can be addressed separately can -interchange information among each other, and eventually realize the aims of -perspective recognition, location, tracking, supervision, and administration [1]. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -3 -Figure 1. The Big data in IoT. -The essential thought of IoT is to interface all things on the planet to the Web. -It is normal that things can be recognized automatically, can speak with each -other, and also can even settle on choices without human interference [2]. Figure -1 shows the relationship between IoT and big data and how the sensor data -represented as big data. -Data are a standout amongst the most important parts of the IoT. In the nature -of the IoT, data are gathered from various types of sensors and speak to billions -of objects. In all considered things, the data on the IoT display the next -challenges: -- The massive scale of the IoT: It includes a huge number of discernment -devices. These devices are consistently and consequently gathering data, -which prompt a quick development of information scale. -- Different of observation gadgets: They inspect varied resources and -heterogeneity of the IoT data. The gathered data from distinctive devices -and measures have different semantics and structures. -- Interoperability: It indicates the way that the vast majority of the IoT -applications are currently secluded. In the long run, the IoT will need to -accomplish data distribution to encourage communitarian among diverse -applications. Taking telemedicine benefit as an instance, once a patient is -in crisis, the movement data is likewise expected to evaluate the landing -ISSN:0254-0223 Vol. 31 (n. 
7, 2016) -4 -interval of the rescue vehicle to choose what sort of assistant medical -strategy to take. -- Multi-dimensionality: It is considered as the principal issue in the -applications of the IoT. For the most part, it incorporates a few sensors to -all the while screen various pointers, such as temperature, dampness, light, -and weight. Along these lines, the specimen information is typically -multidimensional [1]. -Data are extensive in volume, so they are asserted in a mixed bag or moved -with such speed, which are called "Big data." It is not a thing; it is a thought or -ideal model that characterized the expanding, gathering, and utilization of huge -measures of dissimilar information. Big data is helping in choice making and -taking the business to a different universe [3]. -Big data started to be the point of view when the standard database frameworks -were not prepared to handle the unstructured data, such as weblogs, features, -photographs, social overhauls, and human conduct. They are produced by online -networking, sensor devices, or from some other data creating sources. -Figure 2. The Big data 4Vs and data sequence. -Figure 2 observes the big data 4Vs that includes volume, velocity, variety, and -veracity. It also describes the big data sequence. Some issues and technologies -are identified with the accessibility of greatly substantial volumes of data that -organizations need to join and get. There is a significant venture for a time, cash, -and assets that are expected to make this style of processing ordinary. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -5 -The rest of this paper is structured as follows. Section 2 shows some basic -concepts. Section 3 explains the current related work. Section 4 shows the -proposed system and explains each phase in detail. In Section 5, the -implementation results of the proposed techniques are discussed on a benchmark -dataset. Finally, the conclusion and future work are presented in Section 6. -2. BASIC CONCEPTS -2.1 MAPREDUCE -The Big data analytics society has admitted MapReduce as a programming -template for handling massive data on separated systems. MapReduce model -has become one of the perfect choices of the programming paradigm for -processing massive datasets. It is a paradigm for evolving a distributed -clarification for complicated difficulties over enormous datasets [4]. Users -identify a map function that handles a pair of key-value to produce a group of -intermediate key-value sets. In addition, It creates a reduce function that joins -all intermediate values related with the same intermediate key. The MapReduce -architecture is shown in Figure 3. -Figure 3. The MapReduce architecture. - MapReduce Algorithm -There are four steps to implement MapReduce framework, which includes reading a -large dataset, implementing the map function, implementing the reduce function, and -returning the resulting data from the map and reduce. The mapper receives masses of -data and produces intermediate results. The reducer reads the intermediate results and -emits a final result. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -6 -A. Read Large Dataset -Figure 4. The block diagram of MapReduce read data. -As shown in figure 4, we create a data store using the dataset with CSV extension. The -data store displays a tabular datastore object for the data. Then, we select specific -variables' names from the dataset. The selected variables' names feature permits -working with the specified variables of the user's needs. 
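To make the map/reduce contract from Section 2.1 concrete, here is a minimal sketch in Hadoop's Java API, using word counting as a stand-in workload. This is not the paper's own implementation (which is MATLAB/Radoop/KNIME based), and all class and field names are illustrative.

```java
// Minimal illustration of the map/reduce key-value contract described in Section 2.1.
// Word count is used only as a stand-in example.
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountSketch {
  // Map: called once per input record; emits zero or more intermediate (key, value) pairs.
  public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable offset, Text line, Context context)
        throws IOException, InterruptedException {
      StringTokenizer tokens = new StringTokenizer(line.toString());
      while (tokens.hasMoreTokens()) {
        word.set(tokens.nextToken());
        context.write(word, ONE); // intermediate key-value pair
      }
    }
  }

  // Reduce: receives all values sharing one intermediate key and aggregates them.
  public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable c : counts) {
        sum += c.get();
      }
      context.write(key, new IntWritable(sum)); // one final pair per key
    }
  }
}
```

Each map call may emit any number of intermediate pairs, and the framework groups them by key before the reduce call, exactly as the four-step description above states.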
The user can use preview -command to retrieve the data. -B. Generic Map Function -Figure 5 shows the generic map function, which is considered as a general function for -any key and value. This function enables the coder to set any pair of key-value for the -selected dataset. We set intermediate key and intermediate value. Then, we subset the -dataset at this specific value. Finally, we obtain set of key-value stored in the keyvalue -store. -Insert large dataset -Initialize datastore variable to -store large dataset. -Select specific variables' names -from the dataset. -Add selected variables to -datastore. -Preview large dataset. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -7 -Figure 5. The block diagram of generic map function. -C. Map Function -Data Key-value -store -Subset -term -Generic map function -Initialize intermediate key and intermediate -value -Subset data at specific value -Set intermediate key-value store -Adding set of intermediate keys and -intermediate values to intermediate keyvalue -store. -Data Intermediate -key-value -store -Map function receives data and specific value -Set the condition of key-value pair -Create output store to store all partitions of data that -satisfy the key-value condition. -Store all the results in output key-value store. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -8 -Figure 6. The block diagram of map function. -Figure 6 illustrates the Map function that gets a table with the variables labeled by -the selected variables' names property in the data store. Then, the Map function -extracts a subset of the dataset that verifies the condition value of the selected key. -D. Reduce Function -Figure7 shows the Reduce function that receives the subsetted results gained from -the Map function and simply merge them into a single table. The Reduce returns one -key and one value. -Figure7. The block diagram of reduce function. -2.2 DBSCAN ALGORITHM -DBSCAN [5] is a clustering technique that depends on density. The thought is that if a -specific point fits in with a cluster, it ought to be close to loads of different points in -that cluster. The DBSCAN algorithm works as follows. First, two parameters are -picked, a positive number Epsilon and a characteristic number minPoints. Then, start -by picking a subjective point in the dataset. If there are more than minPoints points -inside of a separation of Epsilon starting there, we consider every one of them to be a -piece of a "cluster." Then, we extend that cluster by checking the greater part of the -new points and checking whether they too have more than minPoints points inside of a -Intermediate -value -Key-value -store -Create Reduce function -Initialize output value variable. -Get all intermediate results -While has next results, add intermediate values to the output value. -Adding all output values to output key-value store. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -9 -separation of epsilon. In the end, points are come out to be added to the cluster. After -that, pick a new arbitrary point and repeat the process. Presently, it is entirely probable -that the picked point is less than minPoints points in its Epsilon, and it is not a part of -any other group. Therefore, it is viewed as a "noise point" that is not fitting in any -group. The DBSCAN pseudo code is listed as follows: -DBSCAN steps are as follows: -1. Design a graph whose items are the points to be clustered -2. For each core-point c make an edge from c to every point p in the - -neighborhood of c -3. Set N to the items of the graph; -4. 
If N does not include any core points terminate -5. Pick a core point c in N -6. Let X be the set of nodes that can be reached from c by going forward; -1. create a cluster containing X{c} -2. N=N/(X{c}) -7. Continue with step 4 -2.3 FCM ALGORITHM -FCM [6,7] is a data clustering procedure. The dataset is categorized to n clusters. -Every data point in the dataset related to a cluster, which has a high level of -relationship with that cluster. Another data point that remotely lies from the center of a -cluster has a low level of association with that cluster. This technique is often utilized -in pattern recognition. It depends on minimization of the objective function. The -algorithmic steps for Fuzzy C-Means clustering is as follows: -First, calculate the center of the clusters using the following equation [6]: -Cj = ΣN -i=1 (Mm -ij * xi )/ Mm ij (1) -Then, the objective function is calculated based on the membership matrix by the -following calculation: -Jm=ΣN -i=1 ΣC -j=1 Mm -ij ||xi – cj ||2 (2) -Finally, the membership value is updated by: -M ij= 1/(Σ(||xi – cj|| / || xi - ck||))2/(m-1) (3) -where m is a real number greater than 1, Mij is the degree of membership of xi in the -cluster j, xi is the ith of d-dimensional measured data, cj is the d-dimension center of the -cluster, and ||*|| is the similarity measure between any measured data and the center. -FCM sequentially moves the cluster centers to the right area inside a dataset. FCM -clustering strategies rely on fuzzy behavior, and they give a method that is normal to -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -10 -produce a clustering where membership weights have a characteristic translation but -not probabilistic at all. -2.4 K- NEAREST NEIGHBORS (KNN) -In KNN regression, the outcome is the estimation of the item. This value is normal of -the estimations of its k-closest neighbors. Hence testing the performance should be -appropriate. The KNN computes the Euclidean distance from the query example to the -labeled examples using the following equation [8]. -D=√ (4) -Selecting the ideal value for K is best done by first reviewing the data. A large K value -is more accurate as it reduces the overall noise. Then, labeled examples are ordered by -the highest distance and find a heuristically top K-number of adjacent neighbors. -Finally, search the data for the most likely instance. It does not lose any detail and -compares every training sample to give the prediction. -2.5 SINGULAR VALUE DECOMPOSITION -SVD receives a rectangular matrix of the data that are defined as A, where A is an n x -p matrix, which the n rows represents the data and the p columns represent the -experimental properties. The SVD theorem states that [9]: -Anxp= Unxn Snxp VT -pxp (5) -Where -UTU = Inxn (6) -VTV = Ipxp (i.e. U and V are orthogonal) (7) -where U has columns that are the left singular vectors, S is the same dimensions as A -that contains singular values, and VT has rows that are the right singular vectors. The -SVD represents an outline of the original data in a coordinate system where the matrix -is diagonal. The SVD calculated by the equation: -W = AAT (8) -Wx=ƛx (9) -The scalar  is called an eigenvalue of A, and x is an eigenvector of A relating to . -The computation of the SVD consists of finding the eigenvalues and eigenvectors of -AAT or ATA. The eigenvectors of ATA consist the columns of V, the eigenvectors of -AAT represent the columns of U. Also, the singular values in S are square roots of -eigenvalues from AAT or ATA. 
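Equations (1)-(9) in this section are scrambled by the PDF-to-text extraction. Read against the surrounding definitions, their standard forms are as follows (a reconstruction, not the authors' original typesetting):

```latex
% FCM cluster centre, objective function, and membership update (m > 1):
c_j = \frac{\sum_{i=1}^{N} M_{ij}^{m}\, x_i}{\sum_{i=1}^{N} M_{ij}^{m}} \qquad (1)
J_m = \sum_{i=1}^{N}\sum_{j=1}^{C} M_{ij}^{m}\,\left\| x_i - c_j \right\|^2 \qquad (2)
M_{ij} = \frac{1}{\sum_{k=1}^{C}\left( \left\| x_i - c_j \right\| / \left\| x_i - c_k \right\| \right)^{2/(m-1)}} \qquad (3)

% KNN Euclidean distance between a query q and a labelled example x in d dimensions:
D(q, x) = \sqrt{\sum_{i=1}^{d} (q_i - x_i)^2} \qquad (4)

% SVD of the n x p data matrix A, with orthogonal U, V and diagonal S:
A_{n\times p} = U_{n\times n}\, S_{n\times p}\, V^{T}_{p\times p} \qquad (5)
U^{T}U = I_{n\times n} \qquad (6) \qquad V^{T}V = I_{p\times p} \qquad (7)
W = AA^{T} \qquad (8) \qquad Wx = \lambda x \qquad (9)
```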
The singular values are the diagonal entries of the S -matrix and are arranged in descending order. The singular values are always real -numbers. If the matrix A is a real matrix, then U and V are also real. The SVD feature -specifies the nearest rank-l estimation for a matrix. By putting the little singular values -to zero, we can acquire matrix estimations whose rank meets the number of -outstanding singular values. -3. RELATED WORK -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -11 -Big data from IoT is considered as an important research topic. Many researchers are -working in this field. For example, Tao and Ji [10] utilized the MapReduce technique to -investigate the various little datasets. They proposed procedure for monstrous little data -in light of the K-means clustering calculation. Their outcomes established that the -suggested manner may enhance the information preparing proficiency. They use Kmeans -calculation for data examination in view of MapReduce. Then, they utilized a -record for the converging of data inside the cluster. The data in the same square have a -high likeness when the merger is finished. The exploration will help them to plan a -merger technique of little information in IoT. The DBSCAN algorithm can be suitable -to be applied on big data because the number of clusters is not needed to be known in -the beginning. -Xu and Xun [11] outlined MapReduce model of distributed computing. In the -instrument of MapReduce, they consolidated the structural planning attributes and key -innovation of IoT. They led conveyed mining on information and data in the IoT world. -Also, they represent stream information distribution. In a customary way for mining -valuable information from raw data created by IoT, analyze deficiencies of the -conventional Apriori calculation. Apriori has a lower mining proficiency and consumes -up mass room in memory. The mining technique for monstrous information in the IoT -involves stream information investigation, grouping and so on. They plan to propose a -system for handling Big data with a low charge and apply security of information. The -proposed system has a low effectiveness, so it should be moved forward. -Wang et al. [12] investigated structural planning of the IoT in agribusiness that gives -distributed processing and its usage. The execution planned on a two-tier construction -using HBase. The structural planning gives constant read or access to the enormous -sensor information. In addition, backing the sensor data executed by MapReduce -model. XML documents put standards for the framework to bind the organizations of -heterogeneous sensor data. Utilizing this framework lead the framework to lack of a -variety of sensor data. -Gole and Tidk [13] proposed a ClustBigFIM method, which is based on MapReduce -structure for mining large datasets. ClustBigFIM is an improvement of BigFIM -algorithm that is offering velocity to obtain information from massive datasets. They -are relying on the manner of associations, sequential patterns, correlations, and other -data mining missions that give good vision. MapReduce stage is utilized widely for -mining big data from online networking as convention device and systems. It aims to -employ frequent item to set mining calculation and MapReduce system on a flow of -information. It can be consistent experiences in Big data. -Li et al. [1] suggested a storage managing clarification relied on NoSQL, which is -called IOTMDB. 
They offered a storing managing clarification to handle the massive -and heterogeneous IoT data. The IOTMDB is not only mattered about how to save the -massive IoT data successfully but also to concern for data distribution. The IoT data -storing tactics are applied to incorporate a preprocessing procedure to cover the public -and precise requirements. Their future work will be a model oriented to IOTMDB that -will rely on NoSQL. In addition, they will handle and investigate the massive IoT data -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -12 -to expand its value. Applying a reduction algorithm in the preprocessing step will -improve the accuracy and save time. -Mesiti and Valtolina [14] proposed a structure that was ready to assemble information -from distinctive sources with diverse structures like JSON, XML, literary, and -information gushing from sensors. The information accumulations led the database to -be unstructured and oblige information joining. As the world moves to grow Big -information investigation strategies, they came to an answer that can be loading -information from heterogeneous sensors then incorporate that heterogeneous sensor -information utilizing NoSQL frameworks. They outlined an easy to use loading -framework by deciding an arrangement to choose fitting NoSQL framework that -permits reasonable mapping to be conveyed. -Zhan et al. [15] designed a massive data processing model in the cloud. Their model -can be used to handle all types of data resources, which can be structured, semistructured, -and non-structured. They concentrated on two main points. First, they -outlined the CloudFS that depends on the open sources project Hadoop. Second, they -implemented Cloud Manager DB that is constructed on the open sources project HBase, -MongoDB. Finally, they did not provide any method to deal with the varieties of the -data. -Galache et al. [16] displayed the ClouT extend, which is a joint European-Japanese -venture. Their main issue is making nations mindful of city resources. In addition, they -talk care of these resources by a set of smart IoT services in the Cloud. The proposed -framework based on a three-layer architecture, which is composed of CIaaS, CPaaS, -and CSaaS layers. They developed different four use cases associated with different -applications within four cities. These assets utilized and considered by effective IoT -benefits in the Cloud. -Sowe et al. [17] proposed an answer for massive heterogeneous sensor information -issue. They obliged to make a join between distinctive types of information. This issue -is an incorporated IoT structural planning. It consolidates a Service-Controlled -Networking (SCN) as a key middleware to oversee heterogeneous information -accumulated from sensors on a Big data cloud stage. The proposed model is connected -to accumulate, share information, and control IoT social orders. It allows the client to -investigate, find, and use the sensor information. They utilized the User Defined -Harvester (UDH) advancements notwithstanding SCN to expand the included detection. -In this paper, the portable detecting information is not accessible. They ought to execute -the structure that can treat with this detection information. -Cecchinel et al. [18] proposed a programming structure that ready to support big data -examination work. This structure is the utilization measure of datasets that originate -from physical sensors. These datasets originate from SMARTCAMPUS venture. 
Their -structural engineering can understand genuine prerequisites from the -SMARTCAMPUS venture. As a result, the work done in this structural planning relies -on information from social event and capacity, i.e. the discriminating way of Big data -accumulation stage by utilizing middleware structural engineering. They plan to create -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -13 -a programming model that empowers every client to make its applications. On the off -chance that they include new information, the system programming can be down. -Therefore, the system adaptability ought to be improved. -Mishra et al. [19] proposed a Cognitive-Oriented IoT Big-data framework (COIB) for -the valuable data administration and knowledge detection over the IoT Big data. They -built a general IoT Big data layered design by the usage of the COIB system in the huge -scale mechanical computerization environment. They suggested in their future work to -incorporate mining and examination huge information that is produced by trillions of -IoT items -In this paper, our proposed system offers a solution for storing and retrieving IoT -Big data and improves the accuracy of the resulting data. The proposed system can -store and retrieve a massive number of data in small time. First, we clean noise from -data. Then, we use Kennard sample and SVD as a data reduction techniques to reduce -big data from IoT without losing any data. Also, we use the mutual information -algorithm to detect relationships between attributes and predict the semantic clusters. -Finally, we use MapReduce based on FCM-DBSCAN for data clustering for the vast -store and retrieve of data. -4. THE PROPOSED SYSTEM -Figure 8. The proposed system of the massive–heterogeneous sensor data. -Variety of sensors -Raw data -Data Cleaning -Data Integration -Data Processing (clustering) -Storage -Data Reduction -Storage -Homogenous -data -Dimensional -reduced data -Data with little -size -Cleaned, -noiseless data -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -14 -The proposed system consists of two main phases: data preprocessing and data -processing phases, as shown in figure 8 In the preprocessing phase, the first stage is -data collection stage. In this stage, the dataset collected from different sensors. The -second stage is data cleaning based on outlier detection and noise removal as it is easy -to implement. The third stage is data reduction using SVD algorithm to reduce the -dimensionality of the data and reduce the execution time of data processing then -utilization of Kennard sampling to select a random sample from dataset aiming to save -the running time. The last stage is data integration based on correlation and mutual -information aiming to determine the relationship between attributes and detect semantic -clusters. In the processing phase, data is clustered using FCM-DBSCAN based on -MapReduce as it is a standard programming model for data distribution to improve the -performance of big data in a vast time. In the following subsections, the main stages of -these two phases will be discussed in details. -a. Data Preprocessing Phase: -The preprocessing is a basic phase in data science because it qualifies the choices to be -originated from the qualified data. Data preprocessing is a data mining method that -includes changing raw data into a reasonable information. Genuine information is -frequently inadequate, conflicting, leaking in specific practices, and liable to include -numerous mistakes. 
Data preprocessing is a demonstrated strategy for determining such -issues. It utilizes database-driven applications, such as associations and standard -established applications [20]. The applied data preprocessing steps are data cleaning, -data reduction, and data integration. These steps are discussed in detail in the -following subsections. -a) Data Cleaning: -The procedure of cleaning the data is not easy. The confusion may reach to more -than 30% of real information that could be grimy. In addition, it has exceptionally cost -[21]. Data can be cleaned based on procedures, such as filling in missing values, -smoothing the noisy data, or solving the inconsistencies in the data. Several ways have -been used to deal with missing data, such as [22]: - Deletion: It removes the missing data and using the rest of the data in the -analysis. This deletion can be inefficient as it decreases dataset size and may -delete valuable data. - Imputation: It tries to fill in the missing values with the help of many techniques, -such as: -o Mean/Mode: It fills the missing data by using the mean of a numeric -attribute or mode for a nominal attribute of all data. -o K-Nearest Neighbor Imputation (KNN): It uses KNN algorithms to fill -the missing data. It can deal with discrete and continuous attributes. KNN -searches all the data to find the most similar instances. It can choose the -most probable value from the dataset. -We suggest KNN algorithm for data cleaning. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -15 -Figure 9. The block diagram of the data cleaning steps. -Figure 9 shows that the input data has many challenges as noisy data and outliers. -First, the data is de-duplicated to remove the repetition of the data. Then, the outlier is -detected and excluded from data. The data is filtered to choose the principle attributes -to represent data. Finally, missing values are replaced by the most probable value -depending on KNN regression. -b) Data Reduction -A monstrous measure of information is progressively available from different -sources, for example, logistics insights, cameras, receivers, RFIDs, scanner tag -information, remote sensor systems, and account logs from R&D [23]. Highdimensional -information gets extraordinary difficulties terms of computational manysided -quality and characterization execution. Along these lines, it is important to -obtain a low-dimensional component space from high dimensional component space to -outline a learner with great execution [24]. -Figure 10. The block diagram for the data reduction steps. -Figure 10 shows that the cleaned data is the input for data reduction stage. Data -reduction separated to numericity reduction and dimensionality reduction. The data -numericity reduction can be applied using regression or sampling. The used sampling -algorithm is Kennard sample. Kennard sample reduces the number of iterations by -viewing a list of the highest smallest distances that aims to save time. The data -dimensionality reduction can be applied using many algorithms as PCA, SOM, and -SVD algorithm. We proposed to use SVD for dimensionality reduction. It is suitable -for reducing the dimensionality of large dimensional data. We compare SVD -Input data De-duplication Detect outlier -Replace missing values Filtering -Input data -(Cleaned and Noiseless data) -Numericity Reduction -Dimensionality Reduction -Sampling -Singular Value Decomposition -Reduced data -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -16 -algorithm with other algorithms as PCA, SOM, ICA, PCA (Kernel). 
We conclude that -the SVD algorithm operates in small time than other algorithms. -c) Data Integration -Data with diverse forms are put together and opposes with each other. Data integration -is a successful way to deal with combined data that lives in various sources and gives -brought together for the purpose of access to the end users [25]. Big data presents -organizations with huge volume and multifaceted nature. It comes in diverse -structures–organized, semi-organized, and unorganized– and from any number of -different sources. From this immense, various sorts of information are continuously -developed. Therefore, the organizations should concentrate on speedy, exact, and -significant bits of knowledge [26]. -The proposed algorithm is the mutual information that can able to deal with numeric -data. Mutual information detects the relationship between the attributes and also -detects the semantic clusters. The equation of the mutual information is as follows -[27]: -MI=Σx,y P(X,Y)log2[P(X,Y)/P(X)P(Y)] (10) -where X and Y are the two dimensions of the dataset. -b. Data Processing Phase -Data processing phase is the control of the information processing. Information -preparation refers to the handling of the information that is needed to run associations -[28]. Massive data from IoT require processing for data storing. Huge IoT information -is the high inspecting recurrence, this result in a tremendous measure of repeating or -amazingly comparative qualities. We suggest MapReduce based on a hybrid of FCM -and DBSCAN as a clustering algorithm to overcome the massive data storing problem. -MapReduce is considered as the most suitable technique to apply massive data -processing. -In FCM-DBSCAN Map function, first, we initialize minimum points that represent -minimum value of points in each cluster, epsilon value that represent the distance -between center and point, and membership matrix. Then, we calculate the centers of -clusters using equation -for each point in the dataset, the distance between -points and center of the cluster is calculated using equation d=ΣN -i=1 ΣC -j=1 Mm -ij ||xi – cj -||2. If the distance between point and center of cluster equal or greater than epsilon -value, this point marked as neighborPts to this cluster. Then, the neighbors points for -each center are calculated depending on epsilon value. If neighbor points for any -cluster are less than minimum points, then mark point as a noise else, the point marked -as clustered. We determine the key and create a new cluster. It repeats until reach to -convergence state. Finally, emit each point and each belonging cluster. - FCM-DBSCAN Map Function -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -17 -FCM-DBSCAN Map function -FCM-DBSCAN(D, eps, MinPts, M) -Initialize a number of clusters. -For each point P in dataset D -if P is visited -continue next point -mark P as visited -Calculate the center of clusters by equation -Calculate distance using equation d=ΣN -i=1 ΣC -j=1 Mm -ij ||xi – cj ||2. -For each p in D -Calculate neighborPts for each c based on eps -If d<= eps mark p as neighborPt -if sizeof(NeighborPts) < MinPts -mark P as NOISE -else Prepare the key and create new cluster C. 
-C = next cluster -expandCluster(P, C) -C.neighborPoints = NeighborPts -For each c -Calculate the new value of membership by equation M ij= 1/(Σ(||xi – cj|| / ||xi - ck||))2/(m-1) -Calculate the center of clusters by equation -if new center=old center then Break -End for -Emit(key, c) -End for -End for -End function - FCM-DBSCAN Reduce Function -FCM-DBSCAN Reduce function -FCM-DBSCAN Reduce function (key, c, eps, MinPts) -For all C clusters do -Set finalC.Points equal finalC.points ∪ C.points -For all P in C.neighborPoints do -if P′ is not visited -mark P′ as visited -Calculate NeighborPts′ for each P′ based on eps -If size of NeighborPts′ >= MinPts -set NeighborPts equal NeighborPts ∪ NeighborPts′ -End if -If P′ is not yet a member of any cluster -add P′ to cluster C -End if -End for -End for -Output: Set of clusters of data. -In FCM-DBSCAN Reduce function, the inputs are minimum points, epsilon value, -clusters, and keys. For each C cluster, the final cluster points equal to previous cluster -points in addition to the current cluster points. For all points in the cluster if a point -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -18 -marked as unvisited, then this point is marked as visited. The neighbor points are -calculated and compared with minimum points. If neighbor points are greater or equal -to a minimum point, then neighbor points are equal to neighbor points and cluster -points. Finally, the output is a set of a cluster of data. -As shown in figure 8, raw data is collected from different sensors, which results in -many problems, such as noisy, heterogeneous and massive data. Our proposed work -aim to solve these problems that face sensor data. The raw data is collected from -different sensors and stored. Then we applied preprocessing on this data. Then this -data is cleaned from noise by regression using KNN. We suggest KNN for dealing -with noisy data as it is very simple and can detect the most probable value than another -technique. Then, the cleared data reduced using SVD algorithm. It is very suitable for -reducing the high-dimensional data and for validating significant vision of data. -Therefore, data is sampled using Kennard sample. When applying the sampling, it -speeds the running time. We integrate the data come from heterogeneous sources -based on correlation, covariance matrices using mutual information matrix to detect -the relationship between elements in the dataset and predict semantic clusters. -In the data processing step, the proposed model is the MapReduce model based on -FCM-DBSCAN clustering technique. It is an intensity established clustering -algorithm, which gives an arrangement of entities in some space. It can discover the -clusters of diverse forms and sizes from a huge quantity of data without detecting -some clusters in the beginning. -5. THE EXPERIMENTAL RESULTS AND DISCUSSION -5.1 DATASET DESCRIPTION -The dataset includes ordinary IADL housekeeping activities [29]. These activities are -vacuuming, ironing, dusting, brooming, mopping, cleaning windows, making the bed, -watering plants, washing dishes, and setting the table. The general interval of the -dataset is 240 minutes. The intervals differ amongst some of the activities, indicating -the usual spreading of activities in daily life. They used the Porcupine sensor together -with the iBracelet to record both acceleration and RFID tag detections. The dataset -consists of the estimation of 1048576 records. 
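A single-machine sketch of the per-point rule described for the FCM-DBSCAN Map function above: points within eps of a centre form that centre's neighbourhood, neighbourhoods smaller than MinPts leave their points marked as noise, and the remaining points are labelled with the centre's cluster index. Names and data layout are assumptions; the paper's version runs inside a MapReduce mapper and also updates the fuzzy membership matrix and centres.

```java
// Illustrative, non-distributed sketch of the per-point assignment rule in the
// FCM-DBSCAN Map function above. All names are placeholders; the actual system
// emits (clusterId, point) pairs from a mapper.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class FcmDbscanAssignSketch {
  static double distance(double[] a, double[] b) {
    double sum = 0.0;
    for (int i = 0; i < a.length; i++) {
      double d = a[i] - b[i];
      sum += d * d;
    }
    return Math.sqrt(sum);
  }

  /** Returns, for each point, the index of the cluster it is assigned to, or -1 for noise. */
  static int[] assign(double[][] points, double[][] centers, double eps, int minPts) {
    int[] label = new int[points.length];
    Arrays.fill(label, -1); // -1 = noise until proven otherwise

    for (int c = 0; c < centers.length; c++) {
      // Collect the eps-neighbourhood of this centre.
      List<Integer> neighbours = new ArrayList<>();
      for (int p = 0; p < points.length; p++) {
        if (distance(points[p], centers[c]) <= eps) {
          neighbours.add(p);
        }
      }
      // Only centres with a dense enough neighbourhood form a cluster;
      // sparse neighbourhoods leave their points marked as noise.
      if (neighbours.size() >= minPts) {
        for (int p : neighbours) {
          label[p] = c;
        }
      }
    }
    return label;
  }
}
```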
We implement the proposed technique -on the dataset using Radoop, KNIME, and Matlab 2015b on Core(TM) 2 Due, 2 GH -processor, and 3 GB RAM. -5.2 RESULTS VIEW -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -19 -Figure 11. A part of the used dataset. -Figure 11 shows a part of the used dataset. The act represents activity label results -from ironing, vacuuming, brooming, making the bed, mopping, window cleaning, -watering plant, dish washing, and setting the table. Acc represent 3D Accelerator [x, y, -z] represented in Acc1, Acc2, Acc3, Lgt represent light, Tlt represent nine tilt data, Btn -represent annotation buttons, Rtc represent real time clock [ddmmyyhhmmss], and -Time represents elapsed number of seconds from the beginning of the recording. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -20 -Figure 12. The outlier detection. -Figure 12 shows the outlier detection. A new field called outlier appears. In the state of -finding an outlier, the value of this field is true otherwise the value is false. The outlier -is true when an observation is well outside of the expected scope of values in an -experiment. An outlier arises from variability in the measurement or experimental -error indication. The outliers are excluded from the dataset. -Figure 13. The outlier excluding and replacing the missing values. -Figure 13 shows that the outlier property has the values false for all the tuples, and the -missing values are replaced by the most probable value depending on KNN regression. -Figure14. The SVD deployment. -Figure 14 shows the applying of SVD algorithm that results in the reduction of the -dataset. The data represented using a smaller number of properties. The attribute with -high singular value has the priority to be presented. SVD1 has the highest probability -to present the data. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -21 -Figure 15. The mutual information matrix. -Figure 15 shows the outcome matrix from mutual information. A measure of the -variables mutual dependence is the trans-information of two variables. The mutual -information represents the rate of association or correlation between the row and -column variables. The mutual information partitioned data by 2N where N is the -sample size. The mutual information between items is used as a feature for clustering -to discover semantic clusters. When the value of mutual information is large, it -represents a high relationship between attributes. -5.3 RESULT VIEW OF MAPREDUCE PROCESSING. -Figure 16. The resulting attributes from Read dataset code. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -22 -Figure 17. The MapReduce function execution and read the resulted data after -MapReduce implementation. -Figure 16 shows the read data from MapReduce that observes a set of resulted -attributes view from IADL dataset after data preprocessing phase. Figure 17 shows the -MapReduce implementation. The dataset begins with no Map and no Reduce (Map is -0%, and Reduce is 0%) until Map becomes 100% and Reduce becomes 100%. Then, -we read the data result from MapReduce implementation. -5.4 EVALUATION -The evaluation observes the time and accuracy of preprocessing of the dataset. As -shown in Tables 2 and 3, the precision value is 99.3%, sensitivity value is 99.53%, and -the value of specificity is 85.52%. From the previous results and evaluation, we -conclude that the reduction step and FCM-DBSCAN enhanced the accuracy of the Big -data to be 98.9%. 
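Equations (11)-(14) below did not survive the text extraction. In terms of the TP, FP, TN, and FN counts defined directly underneath them, the standard definitions are:

```latex
\text{Accuracy}    = \frac{TP + TN}{TP + TN + FP + FN} \qquad (11)
\text{Precision}   = \frac{TP}{TP + FP}                \qquad (12)
\text{Sensitivity} = \frac{TP}{TP + FN}                \qquad (13)
\text{Specificity} = \frac{TN}{TN + FP}                \qquad (14)
```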
-Accuracy = -(11) -Precision = -(12) -Sensitivity (TP rate) = -(13) -Specificity (TN rate) = -(14) -TP True Positives: positive tuples correctly labeled -FP False Positives: negative tuples incorrectly labeled -TN True Negatives: negative tuples correctly labeled -FN False Negatives: positive tuples incorrectly labeled -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -23 -Table 1: The comparison between KM, Optics, EM, DBSCAN, and our proposed -system FCM-DBSCAN based on MapReduce model. -PCA PCA(Kernel) ICA SOM SVD -Time -(Sec.) -Accuracy -(%) -Time -(Sec.) -Accuracy -(%) -Time -(Sec.) -Accuracy -(%) -Time -(Sec.) -Accuracy -(%) -Time -(Sec.) -Accuracy -(%) -Performance -measure -k-Means 92 0.2 0.89 2 87 0.2 92.15 0.1 94.73 0.2 -0.79 -Optics 90.2 1.9 91 9.68 65.48 0.6 91.05 1.9 90 -EM 65.82 13 75.28 2.17 94.4 2.54 66.64 8 95.21 2 -DBSCAN 93.4 0.3 89.3 7.3 90.12 0.4 88.46 1 98 3.11 -FCM- 94.5 0.25 91.6 5.2 97.48 0.5 93.5 2.3 98.9 1.5 -DBSCAN -Table 1shows the comparison between different clustering algorithms as K-Means, -Optics, EM, DBSCAN, and the proposed approach FCM-DBSCAN. The clustering -algorithms are tested with different data reduction algorithms, such as PCA, -PCA(Kernel), ICA, SOM, and SVD. -In table 2 and table 3 we divided the dataset to training data and testing data, then we -evaluate the proposed approach on the tested data. -Table 2: The Positive and Negative matrix for the proposed system. -Predicted True False -Actual -Yes 9500 44 -No 66 390 -Table 3: The performance measure of our proposed system. -Recall 99.53% -Precision 99.3% -Sensitivity 99.53% -Specificity 85.52% -Accuracy 98.9% -F-measure 99.39% -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -24 -Time in -seconds -Clustering Techniques -Figure 18. The expended time comparison between different clustering technique -based on different reduction algorithms based on MapReduce model. -From our comparative studies that done in Table 1 and figure 18, we found that FCMDBSCAN -with its varied approaches for data reduction had the high accuracy value. -FCM-DBSCAN with SVD have the highest value of accuracy and retrieve data in a -small time. -K-Means and optics have nearest accuracy value, but optics has longer time. The EM -algorithm takes larger time than other techniques. The DBSCAN has high accuracy but -takes longer time. In FCM-DBSCAN, the accuracy increased and the expected time -decreased. -6.CONCLUSION -A massive amount of IoT data has been generated due to the vast increasing of -existing devices, sensors, actuators, and network communications. The resulting -massive IoT data is called "Big data." Big data refers to a massive data, which takes -much time to be processed. Therefore, we focused on clustering methodology rely on -MapReduce model to store data and recover results in a close real-time. We offer a -framework for processing massive and heterogeneous data in IoT. -This paper illustrated the Big data from IoT from many viewpoints. The raw dataset is -collected from different sensors, which leads to many problems, such as noisy, -heterogeneous, and massive data. Our proposed system aims to solve these problems -that face sensor data. The architecture of the proposed system consists of two main -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -K-Means Optics EM DBSCAN FCM-DBSCAN -PCA -PCA kernel -ICA -SOM -SVD -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -25 -stages: data preprocessing and data processing phases. 
In the preprocessing phase, we -used KNN to clean noisy data and replace missing data, which can use the most -probable value. The SVD is used to reduce data to save time. The mutual information -is implemented to detect the relationship between the data and detect semantic -clustering to achieve high accuracy and speed the running time. The MapReduce -model based on FCM-DBSCAN achieves data clustering by Map and Reduce -functions in a small time that resulted from using reduction technique before data -clustering. The processing time of the proposed system is 1.5 seconds, and the -accuracy is 98.9%. -In future work, we will implement the processing on different datasets and apply -different techniques using spark model that aims to speed the running time. Moreover, -we will implement data query processing using the best and suitable model of NoSQL -database. We suggest Key-value database. The Key-value (KV) stores use the -associative array, which is called a map. This approach can efficiently retrieve -selective key ranges. Also, we will address the challenges and deeply develop the big -data processing in cloud computing environments. -7. REFERENCES -[1] Li, T., Liu, Y., Tian, Y., Shen,S., & Mao, W. (2012). A storage solution for -massive IoT data based on NoSQL. IEEE International Conference on Green -Computing and Communications (GreenCom), Besancon, 50-57. -[2] Tsai, C., Lai, C., Chiang, M., & Yang, L. (2014). Data Mining for Internet of -Things: A Survey. IEEE Communications Surveys & Tutorials, 16(1), 77-97. -[3] Sharma, S. & Mangat, V. (2015). Technology and trends to handle big data: a -survey. 5th IEEE International Conference on Advanced Computing & -Communication Technologies (ACCT), Haryana, 266-271. -[4] Martha, V. S., Zhao, W., & Xu, X. (2013). h-MapReduce: a framework for -workload balancing in MapReduce. 27th IEEE International Conference on -Advanced Information Networking and Applications, 637-644. -[5] Dharni, C. & Bnasal, M. (2013). An Improvement of DBSCAN Algorithm to -Analyze Cluster for Large Datasets. IEEE International Conference on MOOC, -Innovation and Technology in Education (MITE), 42-46. -[6] Ghosh, S. & Kumar, S. (2013). Comparative Analysis of K-Means and Fuzzy CMeans -Algorithms. International Journal Of Advanced Computer Science And -Applications, 4(4), 35-39. -[7] Bora, D. & Gupta, D. (2014). A Comparative study Between Fuzzy Clustering -Algorithm and Hard Clustering Algorithm. International Journal Of Computer -Trends And Technology, 10(2), 108-113. -[8] Han, J., Kamber, M., & Pei, J. (2012). Data Mining Concepts and Techniques. -Third Edition, Elsevier, Chapter 9, 422- 425. -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -26 -[9] Singular Value Decomposition (SVD) tutorial. (2016). Web.mit.edu. Retrieved 7 -Jan 2016, from -http://web.mit.edu/be.400/www/SVD/Singular_Value_Decomposition.htm -[10] Tao, X. & Ji, C. (2014). Clustering massive small data for IOT. 2nd -International Conference on Systems and Informatics( ICSAI), Shanghai, 974- -978. -[11] Liancheng, X. & Jiao, X. (2014). Research on distributed data stream mining in -Internet of Things. International Conference On Logistics Engineering -Management And Computer Science (LEMCS), Atlantis Press, 149- 154. -[12] Wang, H., Lin, G., Wang, J., Gao, W., Chen, Y., & Duan, Q. (2014). -Management of Big Data in the Internet of Things in Agriculture Based on Cloud -Computing. AMM, 548-549, 1438-1444. -[13] Gole, S. & Tidke,B. (2015). 
Frequent itemset mining for big data in social media -using ClustBigFIM algorithm. IEEE International Conference on Pervasive -Computing (ICPC), Pune, 1-6. -[14] Mesiti, M.& Valtolina, S. (2014). Towards a user-friendly loading system for -the analysis of big data in The Internet Of Things. 38Th IEEE Annual -International Computers, Software, And Applications Conference Workshops -(COMPSACW), Vasteras, 312- 317. -[15] Zhang, G., Li,C., Zhang, Y., Xing, C., & Yang, J. (2012). An efficient massive -data processing model in the Cloud- A preliminary report. 7th ChinaGrid Annual -Conference, Beijing, 148-155. -[16] Galache, J., Yonezawa, T., Gurgen, L., Pavia, D., Grella, M., & Maeomichi, H. -(2014). ClouT: leveraging cloud computing techniques for improving -management of massive IoT data. 7th IEEE International Conference on Service- -Oriented Computing and Applications(SOCA), Matsue, 24-327. -[17] Sowe, S., Kimata, T., Dong, M., & Zettsu, K. (2014). Managing heterogeneous -sensor data on a big data platform: IoT services for data-intensive science. 38Th -IEEE Annual International Computers, Software, And Applications Conference -Workshops, Vasteras, 259-300. -[18] Cecchinel, C., Jimenez, M., Mosser, S., & Riveill, M. (2014). An architecture to -support the collection of big data in The Internet Of Things. 10Th IEEE World -Congress On Services, Anchorage, AK, 442-449. -[19] Mishra, N., Lin, C., & Chang, H. (2014). A cognitive-oriented framework for -IoT big-data management prospective. IEEE International Conference -Communication Problem-Solving (ICCP), Beijing, 124-127. -[20] What is Data Preprocessing? - Definition from Techopedia. (2015). Techopedia. -com. Retrieved 9 July 2015, from -http://www.techopedia.com/definition/14650/data-preprocessing -ISSN:0254-0223 Vol. 31 (n. 7, 2016) -27 -[21] Tang, N. (2015). Big RDF data cleaning. 31st IEEE International Conference -on Data Engineering Workshops (ICDEW), Seoul, 77-79 . -[22] Shoaip, N., Elmogy, M., Riad, A., & Badria, F. (2015). Missing Data Treatment -Using Interval-valued Fuzzy Rough Sets with SVM. International Journal of -Advancements in Computing Technology(IJACT), 7(5), 37-48. -[23] Sadeghzadeh, K. & Fard, N. (2015). Nonparametric data reduction approach for -large-scale survival data analysis. IEEE Reliability and Maintainability -Symposium (RAMS), Palm Harbor, 1 – 6. -[24] Katole, S. & Karmore, S. (2015). A new approach of microarray data dimension -reduction for medical applications. 2nd IEEE International Conference on -Electronics and Communication Systems (ICECS), Coimbatore, 409-413. -[25] Saranya, K., Hema, M., & Chandramathi, S. (2014). Data fusion in ontology -based data integration. IEEE International Conference on Information -Communication and Embedded Systems (ICICES), Chennai, Tamil Nadu, India, -1-6. -[26] Pal, K. (2015). How to Address Common Big Data Pain Points. Data Informed. -Retrieved 8 July 2015, from http://data-informed.com/how-to-address-commonbig- -data-pain-points -[27] Cover, T. & Thomas, J. (2012). Elements of information theory. Second Edition, -John Wiley & Sons, Chapter 2, 19-22 . -[28] Encyclopedia Britannica: data processing | computer science. (2015). -Encyclopedia Britannica. Retrieved 7 July 2015, from -http://www.britannica.com/technology/data-processing -[29] ADL Recognition Based on the Combination of RFID and Accelerometer -Sensing | Embedded Sensing Systems - www.ess.tu-darmstadt.de. (2015). Ess.tudarmstadt. -de. Retrieved 17 August 2015, from http://www.ess.tudarmstadt. 
-de/datasets/PHealth08-ADL -View publication stats \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt.xml.xls deleted file mode 100644 index 0a536999d8e50ea8d27a89590e46ce5681b6b4b0..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt.xml.xls and /dev/null differ diff --git "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce-relation.txt" "b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce-relation.txt" deleted file mode 100644 index f54af0f02683226c9ba822d66779242a90cd04d3..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce-relation.txt" +++ /dev/null @@ -1,214 +0,0 @@ -apache hadoop architecture – hdf&Explore&依赖 -architecture&Hadoop&AGGREGATION -we&Hadoop Architecture&依赖 -we&article&依赖 -article&Hadoop architecture&依赖 -Hadoop architecture&architecture&GENERALIZATION -component&Hadoop architecture&AGGREGATION -we&Hadoop architecture&依赖 -we&Hadoop architecture&依赖 -we&detail&依赖 -we&Hadoop Architecture diagram&依赖 -inexpensive , reliable , and scalable framework&big datum&依赖 -goal&inexpensive , reliable , and scalable framework&依赖 -large dataset&sizes and format&AGGREGATION -master-slave architecture&architecture&GENERALIZATION -Hadoop&datum&依赖 -vast amount&datum&AGGREGATION -Hadoop&master-slave architecture&依赖 -Hadoop&vast amount&依赖 -master node&task&依赖 -master node&slave node&依赖 -Slave node&actual business datum&依赖 -Hadoop architecture&three layer&依赖 -Management&component& -hdf and YARN&Hadoop Framework&依赖 -core component&Hadoop Framework&AGGREGATION -us&three core component&依赖 -It&Hadoop&依赖 -file&block-size chunk&依赖 -block&slave node&依赖 -block&cluster&依赖 -we&requirement&依赖 -block size&128 mb&依赖 -block size&128 mb&依赖 -we&which&依赖 -block size&default&依赖 -our&requirements& -block size&default&依赖 -HDFS&master-slave architecture&依赖 -HDFS&Hadoop&依赖 -It&NameNode and DataNode&依赖 -It&two daemon&依赖 -master node&node&GENERALIZATION -NameNode NameNode store&block&依赖 -NameNode NameNode store&names , information&依赖 -NameNode NameNode store&file&依赖 -NameNode NameNode store&block&依赖 -block&file&AGGREGATION -NameNode NameNode store&names , information&依赖 -NameNode NameNode store&file&依赖 -It&Datanodes&依赖 -slave node&actual business datum&依赖 -It&client read/write request&依赖 -namenode store&metada&依赖 -namenode store&metada&依赖 -datanodes store&file&依赖 -datanodes store&block&依赖 -namenode store&block location&依赖 -namenode store&block location&依赖 -It&Hadoop&依赖 -data processing layer&Hadoop&AGGREGATION -application&vast amount&依赖 -application&vast amount&依赖 -application&petabyte&依赖 -application&datum&依赖 -application&terabyte&依赖 -application&datum&依赖 -application&petabyte&依赖 -application&datum&依赖 -application&terabyte&依赖 -application&terabyte&依赖 -application&petabyte&依赖 -application&terabyte&依赖 -cluster&commodity hardware&AGGREGATION -application&datum&依赖 -application&petabyte&依赖 -application&vast amount&依赖 -application&vast amount&依赖 -MapReduce 
framework&framework&GENERALIZATION -MapReduce framework&< key , value > pair&依赖 -MapReduce job&work&依赖 -unit&work&AGGREGATION -MapReduce job&job&GENERALIZATION -MapReduce job&input datum&依赖 -MapReduce job&MapReduce program&依赖 -Hadoop&MapReduce job&依赖 -two type&task&AGGREGATION -Hadoop YARN&task&依赖 -Hadoop YARN&YARN&GENERALIZATION -they&unfavorable condition&依赖 -user&map function&依赖 -map function&function&GENERALIZATION -function&map task&AGGREGATION -output&map task&AGGREGATION -output&reduce task&依赖 -output&reduce task&依赖 -map task&task&GENERALIZATION -Reduce task&map task&依赖 -Reduce task&output&依赖 -Reduce task&aggregation&依赖 -MapReduce task&two phase&依赖 -MapReduce task&task&GENERALIZATION -Hadoop&input&依赖 -Hadoop&input&依赖 -Hadoop&fixed-size split&依赖 -RecordReader&record&依赖 -it&records itself&依赖 -RecordReader&split&依赖 -one map task&a user-defined function call map function&依赖 -Hadoop&map phase&依赖 -one map task&input split&依赖 -Hadoop&one map task&依赖 -input split&split&GENERALIZATION -one map task&record&依赖 -It&zero or multiple intermediate key-value pair&依赖 -It&map task output&依赖 -map task&output&依赖 -map task&local disk&依赖 -its&output& -Hadoop&combiner function&依赖 -combiner function&function&GENERALIZATION -Hadoop&user&依赖 -combiner group&map phase&依赖 -combiner group&datum&依赖 -combiner group&map phase&依赖 -combiner group&datum&依赖 -output&map function&AGGREGATION -It&map function&依赖 -It&output&依赖 -their&output& -map task partition&output&依赖 -their&values& -Hadoop&user&依赖 -Hadoop&partitioning&依赖 -Reducer task&a shuffle and sort step&依赖 -Reducer task&task&GENERALIZATION -main purpose&phase&AGGREGATION -main purpose&equivalent key&依赖 -sort and shuffle phase download&datum&依赖 -It&data piece&依赖 -It&large data list&依赖 -MapReduce framework&sort&依赖 -we&it&依赖 -sort and shuffling&framework&依赖 -developer&control&依赖 -developer&control&依赖 -Reducer&key grouping&依赖 -it&zero or more key-value pair&依赖 -it&OutputFormat&依赖 -Hadoop HDFS&HDFS&GENERALIZATION -reduce task output&Hadoop HDFS&依赖 -It&reducer output&依赖 -reducer output&output&GENERALIZATION -it&default&依赖 -it&key&依赖 -YARN YARN&YARN&GENERALIZATION -YARN YARN&Resource Negotiator&依赖 -resource management layer&Hadoop&AGGREGATION -It&Hadoop 2&依赖 -YARN&separate daemon&依赖 -YARN&functionality&依赖 -YARN&job scheduling&依赖 -YARN&idea&依赖 -job scheduling&scheduling&GENERALIZATION -functionality&job scheduling&AGGREGATION -basic idea&global ResourceManager and application Master&依赖 -application&job&依赖 -single job or DAG&job&AGGREGATION -basic idea&application&依赖 -YARN&ResourceManager and NodeManager&依赖 -apache hadoop yarn 1&apache hadoop yarn 1&依赖 -It&resource&依赖 -It&cluster&依赖 -It&application&依赖 -It&two main component&依赖 -Scheduler&resource&依赖 -Scheduler&running&依赖 -Scheduler&capacities , queues , etc&依赖 -Scheduler&resource&依赖 -It&application&依赖 -It&status&依赖 -Scheduler&restart&依赖 -Scheduler&failed task&依赖 -restart&failed task&AGGREGATION -resource requirement&application&AGGREGATION -It&scheduling&依赖 -ApplicationManager&first container&依赖 -their&status& -It&status and progress&依赖 -It&machine resource usage&依赖 -It&nodemanager (&依赖 -we&article&依赖 -we&Hadoop Architecture&依赖 -Hadoop&master-slave topology&依赖 -architecture&three layer&依赖 -hdf&Hadoop&依赖 -Hadoop cluster&cluster&GENERALIZATION -hdf daemon namenode and yarn daemon resourcemanager&Hadoop cluster&依赖 -hdf daemon namenode and yarn daemon resourcemanager&master node&依赖 -hdf daemon datanode&hdf daemon datanode&依赖 -hdf daemon datanode&hdf daemon datanode&依赖 -hdf daemon datanode&slave node&依赖 -hdf daemon 
datanode&slave node&依赖 -hdf daemon datanode&slave node&依赖 -hdf daemon datanode&slave node&依赖 -hdf daemon datanode&hdf daemon datanode&依赖 -hdf daemon datanode&hdf daemon datanode&依赖 -hdf and mapreduce framework run&same set&依赖 -hdf and mapreduce framework run&same set&依赖 -hdf and mapreduce framework run&node&依赖 -same set&node&AGGREGATION -hdf and mapreduce framework run&same set&依赖 -hdf and mapreduce framework run&node&依赖 -hdf and mapreduce framework run&node&依赖 diff --git "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt" "b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt" deleted file mode 100644 index 2cc94323d42dfbe3723c7bf8d460d5f12d03f6b2..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt" +++ /dev/null @@ -1,152 +0,0 @@ -Apache Hadoop Architecture – HDFS, YARN & MapReduce -Explore the architecture of Hadoop, which is the most adopted framework for storing and processing massive data. - -In this article, we will study Hadoop Architecture. The article explains the Hadoop architecture and the components of Hadoop architecture that are HDFS, MapReduce, and YARN. In the article, we will explore the Hadoop architecture in detail, along with the Hadoop Architecture diagram. - -Let us now start with Hadoop Architecture. - -Hadoop Architecture -hadoop architecture - -The goal of designing Hadoop is to develop an inexpensive, reliable, and scalable framework that stores and analyzes the rising big data. - -Apache Hadoop is a software framework designed by Apache Software Foundation for storing and processing large datasets of varying sizes and formats. - -Hadoop follows the master-slave architecture for effectively storing and processing vast amounts of data. The master nodes assign tasks to the slave nodes. - -The slave nodes are responsible for storing the actual data and performing the actual computation/processing. The master nodes are responsible for storing the metadata and managing the resources across the cluster. - -Slave nodes store the actual business data, whereas the master stores the metadata. - -The Hadoop architecture comprises three layers. They are: - -Storage layer (HDFS) -Resource Management layer (YARN) -Processing layer (MapReduce) -hadoop component - hadoop architecture - -The HDFS, YARN, and MapReduce are the core components of the Hadoop Framework. - -Let us now study these three core components in detail. - -1. HDFS -HDFS Architecture - -HDFS is the Hadoop Distributed File System, which runs on inexpensive commodity hardware. It is the storage layer for Hadoop. The files in HDFS are broken into block-size chunks called data blocks. - -These blocks are then stored on the slave nodes in the cluster. The block size is 128 MB by default, which we can configure as per our requirements. - -Like Hadoop, HDFS also follows the master-slave architecture. It comprises two daemons- NameNode and DataNode. The NameNode is the master daemon that runs on the master node. The DataNodes are the slave daemon that runs on the slave nodes. - -NameNode -NameNode stores the filesystem metadata, that is, files names, information about blocks of a file, blocks locations, permissions, etc. It manages the Datanodes. - -DataNode -DataNodes are the slave nodes that store the actual business data. 
It serves the client read/write requests based on the NameNode instructions. - -DataNodes stores the blocks of the files, and NameNode stores the metadata like block locations, permission, etc. - -2. MapReduce -apache hadoop mapreduce - -It is the data processing layer of Hadoop. It is a software framework for writing applications that process vast amounts of data (terabytes to petabytes in range) in parallel on the cluster of commodity hardware. - -The MapReduce framework works on the pairs. - -The MapReduce job is the unit of work the client wants to perform. MapReduce job mainly consists of the input data, the MapReduce program, and the configuration information. Hadoop runs the MapReduce jobs by dividing them into two types of tasks that are map tasks and reduce tasks. The Hadoop YARN scheduled these tasks and are run on the nodes in the cluster. - -Due to some unfavorable conditions, if the tasks fail, they will automatically get rescheduled on a different node. - -The user defines the map function and the reduce function for performing the MapReduce job. - -The input to the map function and output from the reduce function is the key, value pair. - -The function of the map tasks is to load, parse, filter, and transform the data. The output of the map task is the input to the reduce task. Reduce task then performs grouping and aggregation on the output of the map task. - -The MapReduce task is done in two phases- - -1. Map phase -a. RecordReader - -Hadoop divides the inputs to the MapReduce job into the fixed-size splits called input splits or splits. The RecordReader transforms these splits into records and parses the data into records but it does not parse the records itself. RecordReader provides the data to the mapper function in key-value pairs. - -b. Map - -In the map phase, Hadoop creates one map task which runs a user-defined function called map function for each record in the input split. It generates zero or multiple intermediate key-value pairs as map task output. - -The map task writes its output to the local disk. This intermediate output is then processed by the reduce tasks which run a user-defined reduce function to produce the final output. Once the job gets completed, the map output is flushed out. - -c. Combiner - -Input to the single reduce task is the output from all the Mappers that is output from all map tasks. Hadoop allows the user to define a combiner function that runs on the map output. - -Combiner groups the data in the map phase before passing it to Reducer. It combines the output of the map function which is then passed as an input to the reduce function. - -d. Partitioner - -When there are multiple reducers then the map tasks partition their output, each creating one partition for each reduce task. In each partition, there can be many keys and their associated values but the records for any given key are all in a single partition. - -Hadoop allows users to control the partitioning by specifying a user-defined partitioning function. Generally, there is a default Partitioner that buckets the keys using the hash function. - -2. Reduce phase: -The various phases in reduce task are as follows: - -a. Sort and Shuffle: - -The Reducer task starts with a shuffle and sort step. The main purpose of this phase is to collect the equivalent keys together. Sort and Shuffle phase downloads the data which is written by the partitioner to the node where Reducer is running. - -It sorts each data piece into a large data list. 
The MapReduce framework performs this sort and shuffles so that we can iterate over it easily in the reduce task. - -The sort and shuffling are performed by the framework automatically. The developer through the comparator object can have control over how the keys get sorted and grouped. - -b. Reduce: - -The Reducer which is the user-defined reduce function performs once per key grouping. The reducer filters, aggregates, and combines data in several different ways. Once the reduce task is completed, it gives zero or more key-value pairs to the OutputFormat. The reduce task output is stored in Hadoop HDFS. - -c. OutputFormat - -It takes the reducer output and writes it to the HDFS file by RecordWriter. By default, it separates key, value by a tab and each record by a newline character. - -hadoop mapreduce - hadoop architecture - -3. YARN -YARN stands for Yet Another Resource Negotiator. It is the resource management layer of Hadoop. It was introduced in Hadoop 2. - -YARN is designed with the idea of splitting up the functionalities of job scheduling and resource management into separate daemons. The basic idea is to have a global ResourceManager and application Master per application where the application can be a single job or DAG of jobs. - -YARN consists of ResourceManager, NodeManager, and per-application ApplicationMaster. - -apache hadoop yarn -1. ResourceManager -It arbitrates resources amongst all the applications in the cluster. - -It has two main components that are Scheduler and the ApplicationManager. - -a. Scheduler - -The Scheduler allocates resources to the various applications running in the cluster, considering the capacities, queues, etc. -It is a pure Scheduler. It does not monitor or track the status of the application. -Scheduler does not guarantee the restart of the failed tasks that are failed either due to application failure or hardware failure. -It performs scheduling based on the resource requirements of the applications. -b. ApplicationManager - -They are responsible for accepting the job submissions. -ApplicationManager negotiates the first container for executing application-specific ApplicationMaster. -They provide service for restarting the ApplicationMaster container on failure. -The per-application ApplicationMaster is responsible for negotiating containers from the Scheduler. It tracks and monitors their status and progress. -2. NodeManager: -NodeManager runs on the slave nodes. It is responsible for containers, monitoring the machine resource usage that is CPU, memory, disk, network usage, and reporting the same to the ResourceManager or Scheduler. - -3. ApplicationMaster: -The per-application ApplicationMaster is a framework-specific library. It is responsible for negotiating resources from the ResourceManager. It works with the NodeManager(s) for executing and monitoring the tasks. - -Summary -In this article, we have studied Hadoop Architecture. The Hadoop follows master-slave topology. The master nodes assign tasks to the slave nodes. The architecture comprises three layers that are HDFS, YARN, and MapReduce. - -HDFS is the distributed file system in Hadoop for storing big data. MapReduce is the processing framework for processing vast data in the Hadoop cluster in a distributed manner. YARN is responsible for managing the resources amongst applications in the cluster. - -The HDFS daemon NameNode and YARN daemon ResourceManager run on the master node in the Hadoop cluster. The HDFS daemon DataNode and the YARN NodeManager run on the slave nodes. 
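The map and reduce tasks, the combiner, and the job submission flow described above can be made concrete with the canonical word-count job. The sketch below is illustrative only: it assumes Hadoop's org.apache.hadoop.mapreduce (new) API, and the class names and input/output paths are placeholders, not values taken from this document.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  // Map task: called once per input record; emits (word, 1) pairs.
  public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable offset, Text line, Context context)
        throws IOException, InterruptedException {
      StringTokenizer tokens = new StringTokenizer(line.toString());
      while (tokens.hasMoreTokens()) {
        word.set(tokens.nextToken());
        context.write(word, ONE);
      }
    }
  }

  // Reduce task: receives all counts for one word after shuffle and sort.
  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text word, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable c : counts) {
        sum += c.get();
      }
      context.write(word, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class); // map-side "mini-reduce" to shrink shuffle traffic
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));   // e.g. an HDFS input directory
    FileOutputFormat.setOutputPath(job, new Path(args[1])); // output directory must not exist yet
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Submitting such a job to YARN (for example with "hadoop jar wordcount.jar WordCount /input /output") exercises exactly the division of labour summarized above: the ResourceManager and NameNode coordinate from the master node, while NodeManagers and DataNodes on the slave nodes run the map and reduce tasks against local blocks.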
- -HDFS and MapReduce framework run on the same set of nodes, which result in very high aggregate bandwidth across the cluster. - -Keep Learning!! \ No newline at end of file diff --git "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt.xml.xls" "b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt.xml.xls" deleted file mode 100644 index 5a7cac9932dabcd8a0bb4b7d70da7b279c2db650..0000000000000000000000000000000000000000 Binary files "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt.xml.xls" and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions-relation.txt deleted file mode 100644 index 5c09e91e0f5051ce7614a158d476e6bcd3db513a..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions-relation.txt +++ /dev/null @@ -1,646 +0,0 @@ -PROFILE Sanchita Paul Birla Institute&Technology , Mesra 49 publication&AGGREGATION -https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 -https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 -https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 -content&Puneet Singh Duggal&依赖 -https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 -https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 -https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 -https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 -content&10 december 2016&依赖 -https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 -Diabetes Diagnosis View project Prediction&thunderstorm and lightning&AGGREGATION -https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 -https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 -author&publication&AGGREGATION -soft computing and datum mining view project puneet singh duggal birla institute&Technology&AGGREGATION -user&downloaded file&依赖 -user&enhancement&依赖 -enhancement&downloaded file&AGGREGATION -Technology , Mesra , Ranchi , India duggal@gmail.com Sanchita Paul Department&Computer Science & Engineering Birla Institute&AGGREGATION -We&on-demand , on-command Digital universe&依赖 -Computer Science & Engineering Birla Institute&Technology , Mesra , Ranchi , India duggal@gmail.com Sanchita Paul Department&AGGREGATION -Computer Science & Engineering Birla Institute&Technology Mesra , Ranchi , India sanchita07@gmail.com Abstract —&AGGREGATION -challenge and solutions puneet singh duggal department&Computer Science & Engineering Birla Institute&AGGREGATION -its&Volume& -datum&" Big Data "&依赖 -Most&datum&AGGREGATION -it&nature&依赖 -heterogeneity&datum&AGGREGATION -volume&Big Data&依赖 -Traditional data 
management , warehousing and analysis system&datum&依赖 -specific nature&Big Data&AGGREGATION -it&specific nature&依赖 -it&Big Data&依赖 -its&nature& -it&large distributed file system&依赖 -Map Reduce&efficient analysis&依赖 -efficient analysis&Big Data&AGGREGATION -Map Reduce&Big Data&依赖 -Traditional DBMS technique&Big Data&依赖 -Traditional DBMS technique&classification and clustering&依赖 -classification and clustering&Big Data&AGGREGATION -author&various method&依赖 -author&catering&依赖 -author&various method&依赖 -author&catering&依赖 -use&file indexing&AGGREGATION -Minimization technique&use&依赖 -Minimization technique&file indexing&依赖 -Minimization technique&technique&GENERALIZATION -Map Reduce technique&paper&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&everything&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&click stream datum&依赖 -Keyword-Big Data Analysis&everything&依赖 -structured ( traditional dataset&DBMS table&依赖 -Big Data&e-mail attachment&依赖 -structured ( traditional dataset&DBMS table&依赖 -structured ( traditional dataset&rows and column&依赖 -Big Data&datum&依赖 -structured ( traditional dataset&rows and column&依赖 -Big Data&structured ( traditional dataset&依赖 -heterogeneous mix&datum&AGGREGATION -80 percent&enterprise datum&AGGREGATION -whose size&typical database software tool&依赖 -whose&size& -“ Big data ”&dataset&依赖 -ability&typical database software tool&AGGREGATION -big datum analyticsis&area&依赖 -advanced analytic technique&big data set&依赖 -two&most profound trend&依赖 -two&one&依赖 -one&most profound trend ( bus ) [ 4 ]&AGGREGATION -it&heterogeneity , velocity and volume&依赖 -it&Big Data&依赖 -it&1 ] [ 2 ]&依赖 -it&traditional data analysis and management tool&依赖 -heterogeneity , velocity and volume&Big Data&AGGREGATION -problem&NoSQL&依赖 -problem&NoSQL&依赖 -it&transaction processing&依赖 -analysis&Big Data&AGGREGATION -it&Parallel&依赖 -Map Reduce&[ 12 ]&依赖 -its&architecture& -shared-nothing&commodity diverse hardware ( big cluster )&依赖 -Map Reduce&characteristic&依赖 -function&high-level programming language&依赖 -Its&functions& -Hive tool&tool&GENERALIZATION -Query processing&NoSQL&依赖 -Hive tool&[ 20 ]&依赖 -what&possible solution&依赖 -more business opportunity&affinity&依赖 -more business opportunity&affinity&依赖 -best suppliers , associate 
product&affinity&AGGREGATION -more business opportunity&sale seasonality [ 25 ] etc&依赖 -more business opportunity&best suppliers , associate product&依赖 -more business opportunity&best suppliers , associate product&依赖 -more business opportunity&sale seasonality [ 25 ] etc&依赖 -advanced form&analytics [ 6 ]&AGGREGATION -Traditional experience online analytic processing ( olap )&analytics [ 6 ]&依赖 -Traditional experience online analytic processing ( olap )&advanced form&依赖 -Organizations&specific form&实现 -Organizations&analytic&实现 -specific form&analytic&AGGREGATION -collection&related techniques and tool type&AGGREGATION -user&new business fact&依赖 -user&knew&依赖 -large volume&datum&AGGREGATION -analyst&detail&依赖 -analyst&large volume&依赖 -plenty&detail&AGGREGATION -analyst&plenty&依赖 -enterprise&analytics example&依赖 -enterprise&log datum&依赖 -analyst&datum&依赖 -subset&customer base&AGGREGATION -analyst&historic datum&依赖 -analyst&data warehouse&依赖 -company&new form&依赖 -other product&BI&AGGREGATION -company&customer behavioural change&依赖 -new form&customer behavioural change&AGGREGATION -discovery&metric , report , analytic model&依赖 -company&customer behavioural change&依赖 -company&new form&依赖 -different type&analytic tool&AGGREGATION -unique challenge&special processing system&依赖 -unique challenge&special processing system&依赖 -Map Reduce&[&依赖 -Map Reduce&[&依赖 -analysis&technique&依赖 -distributed file system architecture&original Google File System [ 13 ]&依赖 -Map Reduce job&efficient data processingtechnique&依赖 -Mapping , Combining , Shuffling , Indexing , Grouping and reduce&[ 7 ]&依赖 -phase&MapReduce&AGGREGATION -technique&Map Reduce task&实现 -technique&paper&依赖 -technique&implementation&依赖 -a result need&index&依赖 -a result need&index&依赖 -impact&world-wide Web&AGGREGATION -major issue&world-wide Web&依赖 -major issue&impact&依赖 -major issue&impact&依赖 -major issue&world-wide Web&依赖 -its&content& -Database technology&task&依赖 -company&topology&依赖 -company&information&依赖 -company&Web and users ‟ search history&依赖 -topology&Web and users ‟ search history&AGGREGATION -turn&further challenge&依赖 -turn&millennium&AGGREGATION -google ‟&challenge&实现 -challenge&web-scale datum management and analysis&AGGREGATION -google ‟&web-scale datum management and analysis&实现 -challenge&Web-scale storage&AGGREGATION -whose content&machine&依赖 -hundred&machine&AGGREGATION -whose&content& -whose content&machine&依赖 -it&large file&依赖 -whose content&machine&依赖 -programming model&model&GENERALIZATION -Google&Map Reduce programming model and platform [ 1 ] [ 13 ]&依赖 -its&model& -sort&partitioned parallelism&AGGREGATION -Map Reduce framework&datum&依赖 -Map Reduce framework&a common key (&依赖 -large collection&datum&AGGREGATION -Map Reduce framework&large collection&依赖 -group&instance&AGGREGATION -Facebook&suit&依赖 -its&system& -Hadoop system&traction&依赖 -it&use case include web indexing&依赖 -set&higher-level declarative language&AGGREGATION -Hadoop community&set&依赖 -Hadoop community&higher-level declarative language&依赖 -low-level nature&Map Reduce programming model&AGGREGATION -Popular language&Yahoo!&依赖 -Popular language&Pig&依赖 -Jaql&ibm [ 28 ]&依赖 -Jaql&[ 18 ]&依赖 -Pig&nature&依赖 -60 %&Yahoo!&AGGREGATION -90 %&Facebook Map Reduce use case&AGGREGATION -[ 27 ]&Dryad&依赖 -[ 27 ]&cover&依赖 -[ 27 ]&Dryad&依赖 -Microsoft&Hadoop [ 24 ]&依赖 -Microsoft&support&依赖 -its&strategy& -HADOOP AND HDFS Hadoop&data storage and processing&依赖 -it&hdf&依赖 -It&commodity hardware&依赖 -It&MapReduce&依赖 -It&distributed data processing&依赖 -[ 17 ] [ 19 ]&found&依赖 -software 
architecture&aHadoop stack&AGGREGATION -[ 17 ] [ 19 ]&layer&依赖 -file&byte&依赖 -( very large ) contiguous and randomly addressable sequence&byte&AGGREGATION -file&distributed file system&依赖 -hdf&Hadoop software stack&依赖 -hdf&bottom&依赖 -hdf&bottom&依赖 -hdf&Hadoop software stack&依赖 -file&( very large ) contiguous and randomly addressable sequence&依赖 -bottom&Hadoop software stack&AGGREGATION -HDFS file&file&GENERALIZATION -middle layer&batch analytic&依赖 -middle layer&batch analytic&依赖 -Hadoop Map Reduce system&HDFS file&依赖 -middle layer&stack&AGGREGATION -map phase&job&AGGREGATION -Hadoop Map Reduce system&map operation&依赖 -group&output data item&AGGREGATION -Hadoop Map Reduce system&map operation&依赖 -partition&HDFS file and sort&AGGREGATION -Hadoop Map Reduce system&partition&依赖 -hbase store (&basic key-based record management operation&依赖 -hbase store (&Hadoop stack&依赖 -hbase store (&key-value layer&依赖 -hbase store (&application&依赖 -hbase store (&application&依赖 -hbase store (&key-value layer&依赖 -hbase store (&Hadoop stack&依赖 -hbase store (&basic key-based record management operation&依赖 -top&hdf )&AGGREGATION -contents&HBase&AGGREGATION -Many user&declarative language&依赖 -MapReduce programming model&programming model&GENERALIZATION -Many user&bare MapReduce programming model&依赖 -Many user&Hadoop stack&AGGREGATION -Many user&use&依赖 -use&declarative language&AGGREGATION -High-level language compiler null&Hadoop software stack&依赖 -High-level language compiler null&Hadoop software stack&依赖 -High-level language compiler null&such client&依赖 -High-level language compiler null&such client&依赖 -HDFS Clusters Figure2&traditional experience&依赖 -collection&related technique&AGGREGATION -HDFS Clusters Figure2&relevancy&依赖 -HDFS Clusters Figure2&traditional experience&依赖 -HDFS Clusters Figure2&relevancy&依赖 -Figure 3&Hadoop&实现 -Figure 3&architecture&依赖 -architecture&HDFS clusters implementation&AGGREGATION -Figure 3&HDFS clusters implementation&依赖 -hdf&task&依赖 -Data analysis task&cluster&依赖 -BIG DATA ANALYSIS Heterogeneity&progress&依赖 -BIG DATA ANALYSIS Heterogeneity&progress&依赖 -BIG DATA ANALYSIS Heterogeneity&process&依赖 -phase&datum&依赖 -BIG DATA ANALYSIS Heterogeneity&process&依赖 -BIG DATA ANALYSIS Heterogeneity&phase&依赖 -phase&value&依赖 -BIG DATA ANALYSIS Heterogeneity&phase&依赖 -phase&process&AGGREGATION -much datum today&structured format&依赖 -images and video&storage and display&依赖 -piece&text&AGGREGATION -major creator&value&AGGREGATION -value&datum&AGGREGATION -most datum&digital format today&依赖 -we&opportunity&依赖 -scalability&algorithm&AGGREGATION -Big Data analysis&many application&依赖 -complexity&datum&AGGREGATION -lack&algorithm&AGGREGATION -lack&scalability&AGGREGATION -most&statistician&依赖 -its&interpretation& -presentation&result&AGGREGATION -most&BI related job&AGGREGATION -Figure 4&big data analysis tool&依赖 -glimpse&big data analysis tool&AGGREGATION -data storage part&HDFS distributed file system architecture&依赖 -other mention architecture&amazon web service ( aws ) [ 23 ] , hbase and cloudstore etc&依赖 -HDFS distributed file system architecture&distributed file system architecture&GENERALIZATION -part&hadoop and hdfs framework&AGGREGATION -velocity and heterogeneity&datum&AGGREGATION -volume and veracity&datum&AGGREGATION -layer&bedrock&依赖 -layer&Big Data management and analysis framework&依赖 -layer&Big Data management and analysis framework&依赖 -layer&bedrock&依赖 -their&tools& -MapReduce programming model&map ( )&依赖 -MapReduce programming model&two function and map ( )&依赖 -their&logic& -user&own 
processing logic&实现 -list&intermediate key/value pair&AGGREGATION -map ( ) function&input key/value pair&依赖 -mapreduce runtime system group&mapreduce runtime system group&依赖 -mapreduce runtime system group&mapreduce runtime system group&依赖 -signature&map ( )&AGGREGATION -one master node&slave node&依赖 -list ( v2 )&master-slave architecture&依赖 -list ( v2 )&master-slave architecture&依赖 -number&slave node&AGGREGATION -one master node&number&依赖 -one master node&19 ]&依赖 -Hadoop&MapReduce job&依赖 -MapReduce job&job&GENERALIZATION -data block&one TaskTracker node&依赖 -TaskTracker node&JobTracker&依赖 -scheduler&new task&依赖 -scheduler&it&依赖 -it&data block&依赖 -scheduler&data locality&依赖 -scheduler&account&依赖 -Map Reduce Architecture&local data block&依赖 -Map Reduce Architecture&TaskTracker&依赖 -scheduler&TaskTracker&依赖 -scheduler&rack-local or random data block&依赖 -runtime system group&reduce task&依赖 -set&reduce task&AGGREGATION -runtime system group&set&依赖 -hundreds or thousand&processor&AGGREGATION -scalability and i&heterogeneous and large dataset&依赖 -scalability and i&inbuilt process&依赖 -inbuilt process&heterogeneous and large dataset&AGGREGATION -scalability and i&status and monitoring&依赖 -status and monitoring&heterogeneous and large dataset&AGGREGATION -scalability and i&status and monitoring&依赖 -scalability and i&heterogeneous and large dataset&依赖 -scalability and i&inbuilt process&依赖 -Node –&file&依赖 -Node –&HDFS metada&依赖 -Node –&doesn ‟ t deal&依赖 -Data Node – stores block&HDFS – default replication level&AGGREGATION -job tracker – schedule&job tracker – schedule&依赖 -Task Tracker –&Mapper and Reducer interface&实现 -core&job&AGGREGATION -1 ) mapper mapper&input key/value pair&依赖 -1 ) mapper mapper&intermediate key/value pair&依赖 -1 ) mapper mapper&set&依赖 -set&intermediate key/value pair&AGGREGATION -individual task&input record&依赖 -individual task&intermediate record&依赖 -zero or many output pair&19 ]&依赖 -block&input file&AGGREGATION -number&map&AGGREGATION -total number&block&AGGREGATION -number&input&依赖 -number&total size&依赖 -total size&input&AGGREGATION -right level¶llelism&AGGREGATION -map&execute&依赖 -map&minute&依赖 -10TB&input datum&AGGREGATION -you&input datum&依赖 -blocksize&128MB&AGGREGATION -you&10TB&依赖 -with 82,000 map&17 ] [ 19 ]&依赖 -smaller set&value&AGGREGATION -2 ) reducer reducer&intermediate value&依赖 -intermediate value&key&依赖 -intermediate value&value&依赖 -intermediate value&smaller set&依赖 -set&intermediate value&AGGREGATION -2 ) reducer reducer&set&依赖 -Reducer&3 primary phase&依赖 -Reducer&shuffle&依赖 -2.1 ) shuffle input&mapper&依赖 -2.1 ) shuffle input&mapper&依赖 -sorted output&mapper&AGGREGATION -framework&HTTP&依赖 -framework&relevant partition&依赖 -framework&HTTP&依赖 -output&mapper&AGGREGATION -framework&output&依赖 -relevant partition&output&AGGREGATION -framework&relevant partition&依赖 -framework&output&依赖 -framework group&key&依赖 -framework group&have&依赖 -framework group&reducer input&依赖 -one&a comparator ( secondary sort )&依赖 -( list&value&AGGREGATION -grouped inputs.The output&reduce task&AGGREGATION -application&Reporter&依赖 -output&Reducer&AGGREGATION -right number&reduce&AGGREGATION -better job&load balancing [ MR Framework ]&AGGREGATION -their&round& -faster node&reduce&依赖 -faster node&reduce&依赖 -faster node&first round&依赖 -faster node&first round&依赖 -first round&reduce&AGGREGATION -number&reduce&AGGREGATION -cost&failure&AGGREGATION -scaling factor&a few reduce slot&依赖 -scaling factor&speculative-task&依赖 -It&number&依赖 -number&reduce-task&AGGREGATION -It&reduce-task&依赖 -a ) partitioner 
partitioner partition&key space&依赖 -Partitioner&key&依赖 -key&intermediate map-output&AGGREGATION -Partitioner&intermediate map-output&依赖 -partitioning&key&AGGREGATION -Partitioner&partitioning&依赖 -subset&key )&AGGREGATION -number&job&依赖 -number&reduce task&依赖 -number&of&AGGREGATION -total number&partition&AGGREGATION -this control&task&依赖 -intermediate key (&for reduction&依赖 -b ) reporter reporter&MapReduce application&依赖 -counters.mapper and reducer implementation&Reporter&依赖 -counters.mapper and reducer implementation&progress&依赖 -application&time&依赖 -significant amount&time&AGGREGATION -application&significant amount&依赖 -framework&task&依赖 -application&counter&依赖 -application&Reporter&依赖 -c ) output collector output collector&facility&依赖 -MapReduce framework&framework&GENERALIZATION -name node –&HDFS metada&依赖 -generalization&facility&AGGREGATION -RGPV 274 output&job )&AGGREGATION -library&useful mapper&AGGREGATION -amount&intermediate datum&AGGREGATION -They&" mini-reducer&依赖 -" mini-reducer&mapper&依赖 -" mini-reducer&output&依赖 -combiner&term& -result&collection&依赖 -result&term&依赖 -result&order&依赖 -result&order&依赖 -result&total number&依赖 -result&collection&依赖 -result&network&依赖 -result&collection&依赖 -number&intermediate key-value pair&AGGREGATION -result&total number&依赖 -result&network&依赖 -result&term&依赖 -result&total number&依赖 -result&term&依赖 -result&collection&依赖 -total number&term&AGGREGATION -result&term&依赖 -result&network&依赖 -order&total number&AGGREGATION -result&order&依赖 -order&number&AGGREGATION -number&unique term&AGGREGATION -result&network&依赖 -result&order&依赖 -result&total number&依赖 -They&result size&依赖 -machine&shuffling cost&依赖 -result size&map function&AGGREGATION -They&map function&依赖 -keyword&technique&依赖 -they&document key&依赖 -keyword&which&依赖 -keyword&document&AGGREGATION -keyword&document key&依赖 -they&which&依赖 -> doc4 :24 shuffling shuffling&IMF , Financial Economics Crisis Doc2&依赖 -index&file&AGGREGATION -their&keys& -> doc4 :24 shuffling shuffling&example Doc1&依赖 -harry potter crisis follow&above data IMF&依赖 -inverted index&above data IMF&AGGREGATION -heterogeneous mix&dataset&AGGREGATION -better chance&accurate result&依赖 -We&population&依赖 -We&generating&依赖 -We&shuffling process&依赖 -process&nature&依赖 -their&purpose& -Cartesian product&datum&AGGREGATION -datum&possible combination&AGGREGATION -its&techniques& -Map Reduce&own Join technique&依赖 -it&Map Reduce&依赖 -it&means&依赖 -iterative work&partitioning&依赖 -iterative work&datum&依赖 -iterative work&datum&依赖 -partitioning&datum&AGGREGATION -iterative work&partitioning&依赖 -data sort&clustering&依赖 -new centre&Step 8&依赖 -new centre&Repeat 1-7&依赖 -their&Step7& -one&k centre&AGGREGATION -new centre&Step 8&依赖 -all datum point¢re&依赖 -Input&k centre&依赖 -new centre&Repeat 1-7&依赖 -new centre&Repeat 1-7&依赖 -new centre&Step 8&依赖 -process enormous quantity&datum&AGGREGATION -dizzying array&source&AGGREGATION -organization&customer&依赖 -competitive advantage&6 ]&依赖 -their&customers& -large and heterogeneous dataset&RGPV 275&依赖 -large and heterogeneous dataset&RGPV 275&依赖 -large and heterogeneous dataset&continuous flow&依赖 -large and heterogeneous dataset&Nov 13-15&依赖 -large and heterogeneous dataset&RGPV 275&依赖 -large and heterogeneous dataset&Nov 13-15&依赖 -engineer&information processing tools and application&依赖 -continuous flow&datum&AGGREGATION -large and heterogeneous dataset&RGPV 275&依赖 -large and heterogeneous dataset&datum&依赖 -large and heterogeneous dataset&Nov 13-15&依赖 -large and heterogeneous dataset&Nov 13-15&依赖 -wide 
range&task&AGGREGATION -large and heterogeneous dataset&Nov 13-15&依赖 -large and heterogeneous dataset&RGPV 275&依赖 -massive amount&datum&AGGREGATION -mystery&life&AGGREGATION -secret&cosmos&AGGREGATION -variety&problem&AGGREGATION -tool&task&依赖 -single opportunity&map&依赖 -many example&algorithm&AGGREGATION -them&barrier&实现 -single opportunity&map&依赖 -phase&processing )&AGGREGATION -them&map&实现 -existence&shared global state&AGGREGATION -them&mapreduce (&实现 -single opportunity&map&依赖 -model parameter&shared global state&依赖 -model&training datum&依赖 -process&access&依赖 -process&access&依赖 -process&state&依赖 -process&state&依赖 -process&access&依赖 -process&state&依赖 -synchronization&MapReduce framework&依赖 -synchronization&resource&AGGREGATION -update&one or more reducer&依赖 -synchronization&batch learner&依赖 -update&driver code )&依赖 -smaller number&instance&AGGREGATION -design choice&most existing MapReduce implementation&AGGREGATION -faster processing&smaller dataset&AGGREGATION -style&insufficient use&依赖 -style&insufficient use&依赖 -MapReduce&batch operation&依赖 -MapReduce&datum&依赖 -MapReduce&large amount&依赖 -insufficient use&resource&AGGREGATION -large amount&datum&AGGREGATION -style&computation&AGGREGATION -style&resource&依赖 -style&resource&依赖 -ADVANCEMENTS stream&dealing&依赖 -ADVANCEMENTS stream&alternative programming model&依赖 -one or more stream&input&AGGREGATION -its&design& -Pregel [ 16 ]&programming model&实现 -Valiant&model& -Pregel&large-scale graph algorithm&依赖 -Pig [ 28 ]&data analytics platform&依赖 -Pig script&join&依赖 -Pig script&execution engine&依赖 -Pig script&Hadoop job&依赖 -Pig&engine& -open-source project&user&依赖 -open-source project&large relational dataset&依赖 -open-source project&SQL query&依赖 -top&Hadoop&AGGREGATION -advantage&datum processing capability&AGGREGATION -Hadoop&capabilities& -user&abstraction&AGGREGATION -power&MapReduce&AGGREGATION -power&large cluster&AGGREGATION -development&alternative approach&AGGREGATION -MapReduce&Hadoop/HDFS/MapReduceecosystem&依赖 -MapReduce&generalization&依赖 -MapReduce&Hadoop/HDFS/MapReduceecosystem&依赖 -paper&Map Reduce task&依赖 -join processing mention&n&依赖 -join processing mention&n&依赖 -drawback&present system&AGGREGATION -future direction&traditional datum analysis tool&依赖 -future direction&traditional datum analysis tool&依赖 -paradigm&HDFS and Hadoop&AGGREGATION -1 ] jefry dean and MapReduce&1 ] jefry dean and MapReduce&依赖 -A Flexible Data Processing Tool and Communications and Volume 53&pp 72-77&依赖 -A Flexible Data Processing Tool and Communications and Volume 53&pp 72-77&依赖 -A Flexible Data Processing Tool and Communications and Volume 53&pp 72-77&依赖 -A Flexible Data Processing Tool and Communications and Volume 53&pp 72-77&依赖 -Communications&ACM&AGGREGATION -[ 2 ] jefry dean&[ 2 ] jefry dean&依赖 -Communications&ACM , Volume 51 pp.&AGGREGATION -you&era&依赖 -you&„ big data ‟&依赖 -era&„ big data ‟&AGGREGATION -University&Houston&AGGREGATION -Comparison&Join Algorithms&AGGREGATION -13 ] S. 
Ghemawat&Google File System&依赖 -[ 16 ] grzegorzmalewicz&pp 135-145&依赖 -[ 16 ] grzegorzmalewicz&pp 135-145&依赖 -[ 16 ] grzegorzmalewicz&pp 135-145&依赖 -[ 16 ] grzegorzmalewicz&pp 135-145&依赖 -[ 16 ] grzegorzmalewicz&pp 135-145&依赖 -/ / www.microsoft.com/windowsazure/features/storage/ [ 25 ] The Age&Big Data&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt deleted file mode 100644 index a36ff99b097e67a735ba4cb66ee5fdc0f9a981d1..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt +++ /dev/null @@ -1,146 +0,0 @@ -See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/311558422 -Big Data Analysis: Challenges and Solutions -Conference Paper · December 2013 -CITATIONS -40 -READS -4,637 -2 authors: -Some of the authors of this publication are also working on these related projects: -Soft Computing Approach for Diabetes Diagnosis View project -Prediction of thunderstorm and lightning using soft computing and data mining View project -Puneet Singh Duggal -Birla Institute of Technology, Mesra -2 PUBLICATIONS 43 CITATIONS -SEE PROFILE -Sanchita Paul -Birla Institute of Technology, Mesra -49 PUBLICATIONS 402 CITATIONS -SEE PROFILE -All content following this page was uploaded by Puneet Singh Duggal on 10 December 2016. -The user has requested enhancement of the downloaded file. -International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV -269 -Big Data Analysis: Challenges and Solutions Puneet Singh Duggal Department of Computer Science & Engineering Birla Institute of Technology, Mesra, Ranchi, India duggal@gmail.com -Sanchita Paul Department of Computer Science & Engineering Birla Institute of Technology Mesra, Ranchi, India -sanchita07@gmail.com Abstract—We live in on-demand, on-command Digital universe with data prolifering by Institutions, Individuals and Machines at a very high rate. This data is categories as "Big Data" due to its sheer Volume, Variety, Velocity and Veracity. Most of this data is unstructured, quasi structured or semi structured and it is heterogeneous in nature. The volume and the heterogeneity of data with the speed it is generated, makes it difficult for the present computing infrastructure to manage Big Data. Traditional data management, warehousing and analysis systems fall short of tools to analyze this data. Due to its specific nature of Big Data, it is stored in distributed file system architectures. Hadoop and HDFS by Apache is widely used for storing and managing Big Data. Analyzing Big Data is a challenging task as it involves large distributed file systems which should be fault tolerant, flexible and scalable. Map Reduce is widely been used for the efficient analysis of Big Data. Traditional DBMS techniques like Joins and Indexing and other techniques like graph search is used for classification and clustering of Big Data. These techniques are being adopted to be used in Map Reduce. In this research paper the authors suggest various methods for catering to the problems in hand through Map Reduce framework over Hadoop Distributed File System (HDFS). Map Reduce is a Minimization technique which makes use of file indexing with mapping, sorting, shuffling and finally reducing. 
Map Reduce techniques have been studied at in this paper which is implemented for Big Data analysis using HDFS. Keyword-Big Data Analysis, Big Data Management, Map Reduce, HDFS -I. INTRODUCTION -Big Data encompasses everything from click stream data from the web to genomic and proteomic data from biological research and medicines. Big Data is a heterogeneous mix of data both structured (traditional datasets –in rows and columns like DBMS tables, CSV's and XLS's) and unstructured data like e-mail attachments, manuals, images, PDF documents, medical records such as x-rays, ECG and MRI images, forms, rich media like graphics, video and audio, contacts, forms and documents. Businesses are primarily concerned with managing unstructured data, because over 80 percent of enterprise data is unstructured [26] and require significant storage space and effort to manage.“Big data” refers to datasets whose size is beyond the ability of typical database software tools to capture, store, manage, and analyse [3]. -Big data analyticsis the area where advanced analytic techniques operate on big data sets. It is really about two things, Big data and Analytics and how the two have teamed up to create one of the most profound trends in business intelligence (BI) [4]. Map Reduce by itself is capable for analysing large distributed data sets; but due to the heterogeneity, velocity and volume of Big Data, it is a challenge for traditional data analysis and management tools [1] [2]. A problem with Big Data is that they use NoSQL and has no Data Description Language (DDL) and it supports transaction processing. Also, web-scale data is not universal and it is heterogeneous. For analysis of Big Data, database integration and cleaning is much harder than the traditional mining approaches [4]. Parallel processing and distributed computing is becoming a standard procedure which are nearly non-existent in RDBMS. Map Reduce has following characteristics [12]; it supports Parallel and distributed processing, it is simple and its architecture is shared-nothing which has commodity diverse hardware (big cluster).Its functions are programmed in a high-level programming language (e.g. Java, Python) and it is flexible. Query processing is done through NoSQL integrated in HDFS as Hive tool [20]. Analytics helps to discover what has changed and the possible solutions. Second, advanced analytics is the best way to discover more business opportunities, new customer segments, identify the best suppliers, associate products of affinity, understand sales seasonality[25] etc. Traditional experience in data warehousing, reporting, and online analytic processing (OLAP) is different for advanced forms of analytics [6]. Organizations are implementing specific forms of analytics, particularly called advanced analytics. These are an collection of related techniques and tool types, usually including predictive analytics, data mining, statistical analysis, complex SQL, data visualization, artificial intelligence, natural language processing. Database analytics platforms such as MapReduce, in-database analytics, in-memory databases, and columnar data stores [6] [9] are used for standardizing them. -With big data analytics, the user is trying to discover new business facts that no one in the enterprise knew before, a better term would be “discovery analytics. To do that, the analyst needs large volumes of data with plenty of detail. This is often data that the enterprise has not yet tapped for analytics example, the log data. 
The analyst might mix that data with historic data from a data warehouse and would discover for example, new change behaviour in a subset of the customer base. The discovery would lead to a metric, report, analytic model, or some other product of BI, through which the company could track and predict the new form of customer behavioural change. -International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV -270 -Discovery analytics against big data can be enabled by different types of analytic tools, including those based on SQL queries, data mining, statistical analysis, fact clustering, data visualization, natural language processing, text analytics, artificial intelligence etc [4-6]. A unique challenge for researchers system and academicians is that the large datasets needs special processing systems [5]. Map Reduce over HDFS gives Data Scientists [1-2] the techniques through which analysis of Big Data can be done. HDFS is a distributed file system architecture which encompasses the original Google File System [13].Map Reduce jobs use efficient data processingtechniques which can be applied in each of the phases of MapReduce; namely Mapping, Combining, Shuffling,Indexing, Grouping and Reducing [7]. All these techniques have been studied in this paper for implementation in Map Reduce tasks. -II. BIG DATA : OPPORTUNITIES AND CHALLENGES -In the distributed systems world, “Big Data” started to become a major issue in the late 1990‟s due to the impact of the world-wide Web and a resulting need to index and query its rapidly mushrooming content. Database technology (including parallel databases) was considered for the task, but was found to be neither well-suited nor cost-effective [5] for those purposes. The turn of the millennium then brought further challenges as companies began to use information such as the topology of the Web and users‟ search histories in order to provide increasingly useful search results, as well as more effectively-targeted advertising to display alongside and fund those results. Google‟s technical response to the challenges of Web-scale data management and analysis was simple, by database standards, but kicked off what has become the modern “Big Data” revolution in the systems world [3]. To handle the challenge of Web-scale storage, the Google File System (GFS) was created [13]. GFS provides clients with the familiar OS-level byte-stream abstraction, but it does so for extremely large files whose content can span hundreds of machines in shared-nothing clusters created using inexpensive commodity hardware [5]. To handle the challenge of processing the data in such large files, Google pioneered its Map Reduce programming model and platform [1][13]. This model, characterized by some as “parallel programming for dummies”, enabled Google‟s developers to process large collections of data by writing two user-defined functions, map and reduce, that the Map Reduce framework applies to the instances (map) and sorted groups of instances that share a common key (reduce) – similar to the sort of partitioned parallelism utilized in shared-nothing parallel query processing. -Driven by very similar requirements, software developers at Yahoo!, Facebook, and other large Web companies followed suit. Taking Google‟s GFS and Map Reduce papers as rough technical specifications, open-source equivalents were developed, and the Apache Hadoop Map Reduce platform and its underlying file system (HDFS, the Hadoop Distributed File System) were born [1] [12]. 
The Hadoop system has quickly gained traction, and it is now widely used for use cases including Web indexing, clickstream and log analysis, and certain large-scale information extraction and machine learning tasks. Soon tired of the low-level nature of the Map Reduce programming model, the Hadoop community developed a set of higher-level declarative languages for writing queries and data analysis pipelines that are compiled into Map Reduce jobs and then executed on the Hadoop Map Reduce platform. Popular languages include Pig from Yahoo! [18], Jaql from IBM [28], and Hive from Facebook [18]. Pig is relational-algebra-like in nature, and is reportedly used for over 60% of Yahoo!‟s MapReduce use cases; Hive is SQL-inspired and reported to be used for over 90% of the Facebook Map Reduce use cases. Microsoft‟s technologies include a parallel runtime system called Dryad and two higher-level programming models, Dryad LINQ and the SQLlike SCOPE language [27], which utilizes Dryad under the covers. Interestingly, Microsoft has also recently announced that its future “Big Data” strategy includes support for Hadoop[24]. -III. HADOOP AND HDFS -Hadoop is a scalable, open source, fault-tolerant Virtual Grid operating system architecture for data storage and processing. It runs on commodity hardware, it uses HDFS which is fault-tolerant high-bandwidth clustered storage architecture. It runs MapReduce for distributed data processing and is works with structured and unstructured data. -Figure1Illustrates the layers found in the software architecture of aHadoop stack [17] [19]. At the bottom of the Hadoop software stack is HDFS, a distributed file system in which each file appears as a (very large) contiguous and randomly addressable sequence of bytes. For batch analytics, the middle layer of the stack is the Hadoop Map Reduce system, which applies map operations to the data in partitions of an HDFS file, sorts and redistributes the results based on key values in the output data, and then performs reduce operations on the groups of output data items with matching keys from the map phase of the job. For applications just needing basic key-based record management operations, the HBase store (layered on top of HDFS) is available as a key-value layer in the Hadoop stack. As indicated in the figure, the contents of HBase can either be directly accessed and manipulated by a client application or accessed via Hadoop for analytical needs. Many users of the Hadoop stack prefer the use of a declarative language over the bare MapReduce programming model. High-level language compilers (Pig and Hive) are thus the topmost layer in the Hadoop software stack for such clients. -International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV -271 -Figure 1.Hadoop Architecture Layers Figure 2.Hadoop Architecture Tools and usage -Figure 3. HDFS Clusters -Figure2 shows the relevancy between the traditional experience in data warehousing, reporting, and online analytic processing (OLAP) and advanced analytics with collection of related techniques like data mining with DBMS, artificial intelligence, machine learning, and database analytics platforms such as MapReduce and Hadoop over HDFS [4] [9]. Figure 3 shows the architecture of HDFS clusters implementation with Hadoop. It can be seen that HDFS has distributed the task over two parallel clusters with one server and two slave nodes each. Data analysis tasks are distributed in these clusters. -IV. 
BIG DATA ANALYSIS -Heterogeneity, scale, timeliness, complexity, and privacy problems with Big Data hamper the progress at all phases of the process that can create value from data. Much data today is not natively in structured format; for example, tweets and blogs are weakly structured pieces of text, while images and video are structured for storage and display, but not for semantic content and search: transforming such content into a structured format for later analysis is a major challenge [15]. The value of data enhances when it can be linked with other data, thus data integration is a major creator of value. Since most data is directly generated in digital format today, we have the opportunity and the challenge both to influence the creation to facilitate later linkage and to automatically link previously created data. Data analysis, organization, retrieval, and modelling are other foundational challenges [6]. Big Data analysis is a clear bottleneck in many applications, both due to lack of scalability of the underlying algorithms and due to the complexity of the data that needs to be analysed. Finally, presentation of the results and its interpretation by non-technical domain experts is crucial to extracting actionable knowledge as most of the BI related jobs are handled by statisticians and not software experts. -Figure 4, below gives a glimpse of the Big Data analysis tools which are used for efficient and precise data analysis and management jobs. The Big Data Analysis and management setup can be understood through the layered structured defined in the figure. The data storage part is dominated by the HDFS distributed file system architecture; other mentioned architectures available are Amazon Web Service (AWS) [23], Hbase and CloudStore etc. The data processing tasks for all the tools is Map Reduce; we can comfortably say that it is the de-facto Data processing tool used in the Big Data paradigm. -International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV -272 -Figure 4. Big Data Analysis Tools -For handling the velocity and heterogeneity of data, tools like Hive, Pig and Mahout are used which are parts of Hadoop and HDFS framework. It is interesting to note that for all the tools used, Hadoop over HDFS is the underlying architecture. Oozie and EMR with Flume and Zookeeper are used for handling the volume and veracity of data, which are standard Big Data management tools. The layer with their specified tools forms the bedrock for Big Data management and analysis framework. -V. MAP REDUCE -MapReduce [1-2] is a programming model for processing large-scale datasets in computer clusters. The MapReduce programming model consists of two functions, map() and reduce(). Users can implement their own processing logic by specifying a customized map() and reduce() function. The map() function takes an input key/value pair and produces a list of intermediate key/value pairs. The MapReduce runtime system groups together all intermediate pairs based on the intermediate keys and passes them to reduce() function for producing the final results. Map (in_key, in_value) --->list(out_key,intermediate_value) Reduce (out_key,list(intermediate_value)) -- ->list(out_value) The signatures of map() and reduce() are as follows : map (k1,v1) ! list(k2,v2)and reduce (k2,list(v2)) ! list(v2) -A MapReduce cluster employs a master-slave architecture where one master node manages a number of slave nodes [19]. 
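The map() and reduce() signatures quoted above, map(k1, v1) → list(k2, v2) and reduce(k2, list(v2)) → list(v2), describe a pure dataflow that can be sketched in a single JVM before any distributed runtime is involved. The following sketch is an illustration under that assumption; the interface names and the word-count logic are not defined in the paper.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;

// Single-JVM illustration of the map(k1,v1) -> list(k2,v2) and
// reduce(k2, list(v2)) -> list(v2) contract: map every input pair,
// group the intermediate pairs by key (the "shuffle"), then reduce each group.
public class MapReduceSketch {

  interface MapFn<K1, V1, K2, V2> {
    void map(K1 key, V1 value, BiConsumer<K2, V2> emit);
  }

  interface ReduceFn<K2, V2, V3> {
    List<V3> reduce(K2 key, List<V2> values);
  }

  static <K1, V1, K2, V2, V3> Map<K2, List<V3>> run(
      Map<K1, V1> input, MapFn<K1, V1, K2, V2> mapFn, ReduceFn<K2, V2, V3> reduceFn) {
    // Map phase: apply mapFn to every (k1, v1) pair and collect intermediate pairs.
    Map<K2, List<V2>> groups = new HashMap<>();
    input.forEach((k1, v1) ->
        mapFn.map(k1, v1, (k2, v2) -> groups.computeIfAbsent(k2, k -> new ArrayList<>()).add(v2)));
    // Reduce phase: one reduce call per intermediate key.
    Map<K2, List<V3>> output = new HashMap<>();
    groups.forEach((k2, values) -> output.put(k2, reduceFn.reduce(k2, values)));
    return output;
  }

  public static void main(String[] args) {
    Map<String, String> docs = Map.of("doc1", "imf financial crisis", "doc2", "imf crisis");
    Map<String, List<Integer>> counts = run(
        docs,
        (doc, text, emit) -> { for (String w : text.split("\\s+")) emit.accept(w, 1); },
        (word, ones) -> List.of(ones.size()));
    System.out.println(counts); // e.g. {imf=[2], crisis=[2], financial=[1]}
  }
}

In the real framework the grouping step is the distributed shuffle and sort, and the map and reduce calls run on separate TaskTracker nodes, as described next.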
In Hadoop, the master node is called JobTracker and the slave node is called TaskTracker, as shown in Figure 7. Hadoop launches a MapReduce job by first splitting the input dataset into even-sized data blocks. Each data block is then scheduled to one TaskTracker node and is processed by a map task. The TaskTracker node notifies the JobTracker when it is idle. The scheduler then assigns new tasks to it. The scheduler takes data locality into account when it disseminates data blocks.
-Figure 5. Map Reduce Architecture and Working
-It always tries to assign a local data block to a TaskTracker. If the attempt fails, the scheduler will assign a rack-local or random data block to the TaskTracker instead. When map() functions complete, the runtime system groups all intermediate pairs and launches a set of reduce tasks to produce the final results. Large-scale data processing is a difficult task; managing hundreds or thousands of processors and handling parallelization and distributed environments makes it more difficult. Map Reduce provides a solution to these issues: it supports distributed and parallel I/O scheduling, it is fault tolerant, it supports scalability, and it has inbuilt processes for status and monitoring of heterogeneous and large datasets as in Big Data [14].
-A. Map Reduce Components
-1. Name Node – manages HDFS metadata, doesn't deal with files directly
-2. Data Node – stores blocks of HDFS – default replication level for each block: 3
-3. Job Tracker – schedules, allocates and monitors job execution on slaves – Task Trackers
-4. Task Tracker – runs Map Reduce operations
-Figure 6. Map Reduce Components
-B. Map Reduce Working
-We implement the Mapper and Reducer interfaces to provide the map and reduce methods, as shown in Figure 6. These form the core of the job.
-1) Mapper
-Mapper maps input key/value pairs to a set of intermediate key/value pairs. Maps are the individual tasks that transform input records into intermediate records. The transformed intermediate records do not need to be of the same type as the input records. A given input pair may map to zero or many output pairs [19]. The number of maps is usually driven by the total size of the inputs, that is, the total number of blocks of the input files. The right level of parallelism for maps seems to be around 10-100 maps per node, although it has been set up to 300 maps for very CPU-light map tasks. Task setup takes a while, so it is best if the maps take at least a minute to execute. For example, if you expect 10TB of input data and have a block size of 128MB, you'll end up with 82,000 maps [17] [19].
-2) Reducer
-Reducer reduces a set of intermediate values which share a key to a smaller set of values. Reducer has 3 primary phases: shuffle, sort and reduce.
-2.1) Shuffle
-Input to the Reducer is the sorted output of the mappers. In this phase the framework fetches the relevant partition of the output of all the mappers, via HTTP.
-2.2) Sort
-The framework groups Reducer inputs by keys (since different mappers may have output the same key) in this stage. The shuffle and sort phases occur simultaneously; while map-outputs are being fetched they are merged.
-2.3) Secondary Sort
-If equivalence rules for grouping the intermediate keys are required to be different from those for grouping keys before reduction, then one may specify a Comparator (secondary sort). 
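One common way to realize the secondary sort just described is a grouping comparator that compares only the "natural" part of a composite key, so that the sort comparator orders the full key while the grouping comparator decides which keys reach the same reduce() call. A minimal sketch follows; the "customerId#timestamp" composite Text key format is an assumption for illustration, while the Hadoop classes themselves come from the framework.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Groups composite keys of the form "customerId#timestamp" by customerId only,
// so one reduce() call sees every value for a customer while the full composite
// key still controls the sort order inside that group.
public class NaturalKeyGroupingComparator extends WritableComparator {

  public NaturalKeyGroupingComparator() {
    super(Text.class, true); // instantiate Text keys for deserialization
  }

  @Override
  public int compare(WritableComparable a, WritableComparable b) {
    String left = ((Text) a).toString().split("#", 2)[0];
    String right = ((Text) b).toString().split("#", 2)[0];
    return left.compareTo(right);
  }
}

// Wired into the job driver (org.apache.hadoop.mapreduce.Job):
//   job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);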
-2.4) Reduce -In this phase the reduce method is called for each pair in the grouped inputs.The output of the reduce task is typically written to the File System via Output Collector[19]. -Applications can use the Reporter to report progress, set application-level status messages and update Counters, or just indicate that they are alive. The output of the Reducer is not sorted. The right number of reduces seems to be 0.95 or 1.75 multiplied by no. of nodes. With 0.95 all of the reduces can launch immediately and start transferring map outputs as the maps finish. With 1.75 the faster nodes will finish their first round of reduces and launch a second wave of reduces doing a much better job of load balancing [MR Framework].Increasing the number of reduces increases the framework overhead, but increases load balancing and lowers the cost of failures. The scaling factors above are slightly less than whole numbers to reserve a few reduce slots in the framework for speculative-tasks and failed tasks. It is legal to set the number of reduce-tasks to zero if no reduction is desired. -a) Partitioner -Partitioner partitions the key space. Partitioner controls the partitioning of the keys of the intermediate map-outputs. The key (or a subset of the key) is used to derive the partition, typically by a hash function. The total number of partitions is the same as the number of reduce tasks for the job. Hence this controls which of the m reduce tasks the intermediate key (and hence the record) is sent to for reduction. -Hash Partitioner is the default Partitioner. -b) Reporter -Reporter is a facility for MapReduce applications to report progress, set application-level status messages and update Counters.Mapper and Reducer implementations can use the Reporter to report progress or just indicate that they are alive. In scenarios where the application takes a significant amount of time to process individual key/value pairs, this is crucial since the framework might assume that the task has timed-out and kill that task. Applications can also update Counters using the Reporter. -c) Output Collector -Output Collector is a generalization of the facility provided by the MapReduce framework to collect data output by the Mapper or the Reducer (either the intermediate outputs or the -Name Node–manages HDFS metadata, doesn’t deal with files directly -Data Node –stores blocks of HDFS –default replication level for each block: 3 -Job Tracker –schedules, allocates and monitors job execution on slaves –Task Trackers -Task Tracker –runs Map Reduce operations -International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV -274 -output of the job). HadoopMapReduce comes bundled with a library of generally useful mappers, reducers, and partitioners -Figure 7. Map Reduce Working through Master / Slave -C. Map Reduce techniques - Combining -Combiners provide a general mechanism within the MapReduce framework to reduce the amount of intermediate data generated by the mappers. They can be understood as "mini-reducers" that process the output of mappers. The combiner's aggregate term counts across the documents processed by each map task. This result in a reduction in the number of intermediate key-value pairs that need to be shuffled across the network, from the order of total number of terms in the collection to the order of the number of unique terms in the collection. They reduce the result size of map functions and perform reduce-like function in each machine which decreases the shuffling cost. 
- Inverse Indexing -Inverse indexing is a technique in which the keywords of the documents are mapped according to the document keys in which they reside. For example: Doc1: IMF, Financial Economics Crisis; Doc2: IMF, Financial Crisis; Doc3: Harry Economics; Doc4: Financial Harry Potter Film; Doc5: Harry Potter Crisis. The following is the inverted index of the above data: IMF -> Doc1:1, Doc2:1; Financial -> Doc1:6, Doc2:6, Doc4:1; Economics -> Doc1:16, Doc3:7; Crisis -> Doc1:26, Doc2:16, Doc5:14; Harry -> Doc3:1, Doc4:11, Doc5:1; Potter -> Doc4:17, Doc5:7; Film -> Doc4:24. - Shuffling -Shuffling is the procedure of mixing the indexes of the files and their keys, so that a heterogeneous mix of the dataset can be obtained. If the dataset is shuffled, then there are better chances that the resultant query processing will yield near-accurate results. We can relate the shuffling process to the population generated by crossover in genetic algorithms (GA). The processes are different in nature, but their purpose is similar [7]. - Sharding -It is a term used to distribute the Mappers in the HDFS architecture. Sharding refers to the grouping of documents so that MapReduce jobs run in parallel in a distributed environment. - Joins -Join is an RDBMS term; it refers to combining two or more discrete datasets to get the Cartesian product of all possible combinations of the data. MapReduce does not have its own join techniques, but RDBMS techniques are tweaked and used to get the maximum possible combinations. The join techniques adopted for MapReduce are Equi Join, Self Join, Repartition Join and Theta Join [7][10-11]. - Clustering & Classification -These are data-analysis terms, used mainly in data mining. In MapReduce this is achieved through k-means clustering [7]. Here, iterative execution improves the partitioning of the data into k clusters. After the clustering, the sorted data are grouped together based upon rules to form classes. The steps for clustering in MapReduce are: Step 1: Do; Step 2: Map; Step 3: the input is a data point and the k centres are broadcast; Step 4: find the closest centre among the k centres for the input point; Step 5: Reduce; Step 6: the input is one of the k centres and all data points having this centre as their closest centre; Step 7: calculate the new centre using these data points; Step 8: repeat steps 1-7 until none of the new centres change. -VI. CONCLUSION -The need to process enormous quantities of data has never been greater. Not only are terabyte- and petabyte-scale datasets rapidly becoming commonplace, but there is consensus that great value lies buried in them, waiting to be unlocked by the right computational tools. In the commercial sphere, business intelligence, driven by the ability to gather data from a dizzying array of sources, together with Big Data analysis tools like MapReduce over Hadoop and HDFS, promises to help organizations better understand their customers and the marketplace, hopefully leading to better business decisions and competitive advantages [6]. For engineers building information processing tools and applications, large and heterogeneous datasets which generate a continuous flow of data lead to more effective algorithms for a wide range of tasks, from machine translation to spam detection. In the natural and physical sciences, the ability to analyse massive amounts of data may provide the key to unlocking the secrets of the cosmos or the mysteries of life.
MapReduce can be exploited to solve a variety of problems related to text processing at scales that would have been unthinkable a few years ago [15]. No tool, no matter how powerful or flexible, can be perfectly adapted to every task. There are many examples of algorithms that depend crucially on the existence of shared global state during processing, making them difficult to implement in MapReduce (since the single opportunity for global synchronization in MapReduce is the barrier between the map and reduce phases of processing). Implementing online learning algorithms in MapReduce is problematic [14]. The model parameters in a learning algorithm can be viewed as shared global state, which must be updated as the model is evaluated against training data. All processes performing the evaluation (presumably the mappers) must have access to this state. In a batch learner, where updates occur in one or more reducers (or, alternatively, in the driver code), synchronization of this resource is enforced by the MapReduce framework. However, with online learning, these updates must occur after processing smaller numbers of instances. This means that the framework must be altered to support faster processing of smaller datasets, which goes against the design choices of most existing MapReduce implementations. Since MapReduce was specifically optimized for batch operations over large amounts of data, such a style of computation would likely result in inefficient use of resources [2]. In Hadoop, for example, map and reduce tasks have considerable start-up costs. -VII. ADVANCEMENTS -Streaming algorithms [9] represent an alternative programming model for dealing with large volumes of data with limited computational and storage resources. This model assumes that data are presented to the algorithm as one or more streams of inputs that are processed in order, and only once. Stream processing is very attractive for working with time-series data (news feeds, tweets, sensor readings, etc.), which is difficult in MapReduce (once again, given its batch-oriented design). Another system worth mentioning is Pregel [16], which implements a programming model inspired by Valiant's Bulk Synchronous Parallel (BSP) model. Pregel was specially designed for large-scale graph algorithms, but unfortunately there are few published details at present. -Pig [28], which is inspired by Google [13], can be described as a data analytics platform that provides a lightweight scripting language for manipulating large datasets. Although Pig scripts (in a language called Pig Latin) are ultimately converted into Hadoop jobs by Pig's execution engine, they allow developers to specify data transformations (filtering, joining, grouping, etc.) at a much higher level. Similarly, Hive [20], another open-source project, provides an abstraction on top of Hadoop that allows users to issue SQL queries against large relational datasets stored in HDFS. Hive queries, written in HiveQL, are compiled down to Hadoop jobs by the Hive query engine. Therefore, the system provides a data analysis tool for users who are already comfortable with relational databases, while simultaneously taking advantage of Hadoop's data processing capabilities [11]. The power of MapReduce derives from providing an abstraction that allows developers to harness the power of large clusters, but abstractions manage complexity by hiding details and presenting well-defined behaviours to users of those abstractions.
This process makes certain tasks easier, but others more difficult, if not impossible. MapReduce is certainly no exception to this generalization; even within the Hadoop/HDFS/MapReduce ecosystem, the development of alternative approaches for expressing distributed computations has already been observed. For example, there can be a third merge phase after map and reduce to better support relational operations. The join processing mentioned in the paper can also tackle MapReduce tasks effectively. The future directions in Big Data analysis give a very encouraging picture, as the tools are built on the existing paradigm of HDFS and Hadoop, overcoming the existing drawbacks of present systems while keeping the advantages they provide over traditional data analysis tools. -REFERENCES -[1] Jeffrey Dean and Sanjay Ghemawat, MapReduce: A Flexible Data Processing Tool, Communications of the ACM, Volume 53, Issue 1, January 2010, pp. 72-77. -[2] Jeffrey Dean and Sanjay Ghemawat, MapReduce: Simplified Data Processing on Large Clusters, Communications of the ACM, Volume 51, pp. 107–113, 2008. -[3] Brad Brown, Michael Chui, and James Manyika, Are You Ready for the Era of 'Big Data'?, McKinsey Quarterly, McKinsey Global Institute, October 2011. -[4] Dunren Che, Mejdl Safran, and Zhiyong Peng, From Big Data to Big Data Mining: Challenges, Issues, and Opportunities, DASFAA Workshops 2013, LNCS 7827, pp. 1–15, 2013. -[5] Marcin Jedyk, Making Big Data Small: Using Distributed Systems for Processing, Analysing and Managing Large Data Sets, Software Professional's Network, Cheshire Data Systems Ltd. -[6] Onur Savas, Yalin Sagduyu, Julia Deng, and Jason Li, Tactical Big Data Analytics: Challenges, Use Cases and Solutions, Big Data Analytics Workshop in conjunction with ACM SIGMETRICS 2013, June 21, 2013. -[7] Kyuseok Shim, MapReduce Algorithms for Big Data Analysis, DNIS 2013, LNCS 7813, pp. 44–48, 2013. -[8] Raja Appuswamy, Christos Gkantsidis, Dushyanth Narayanan, Orion Hodson, Antony Rowstron, Nobody Ever Got Fired for Buying a Cluster, Microsoft Research, Cambridge, UK, Technical Report MSR-TR-2013-2. -[9] Carlos Ordonez, Algorithms and Optimizations for Big Data Analytics: Cubes, Tech Talks, University of Houston, USA. -[10] Spyros Blanas, Jignesh M. Patel, Vuk Ercegovac, Jun Rao, Eugene J. Shekita, Yuanyuan Tian, A Comparison of Join Algorithms for Log Processing in MapReduce, SIGMOD '10, June 6–11, 2010, Indianapolis, Indiana, USA. -[11] Tyson Condie, Neil Conway, Peter Alvaro, Joseph M. Hellerstein, John Gerth, Justin Talbot, Khaled Elmeleegy, Russell Sears, Online Aggregation and Continuous Query Support in MapReduce, SIGMOD '10, June 6–11, 2010, Indianapolis, Indiana, USA. -[12] J. Dean and S. Ghemawat, "MapReduce: Simplified data processing on large clusters," in USENIX Symposium on Operating Systems Design and Implementation, San Francisco, CA, Dec. 2004, pp. 137–150. -[13] S. Ghemawat, H. Gobioff, and S. Leung, "The Google File System," in ACM Symposium on Operating Systems Principles, Lake George, NY, Oct 2003, pp. 29–43. -[14] HADOOP-3759: Provide ability to run memory intensive jobs without affecting other running tasks on the nodes. https://issues.apache.org/jira/browse/HADOOP-3759 -[15] Vinayak Borkar, Michael J. Carey, Chen Li, Inside "Big Data Management": Ogres, Onions, or Parfaits?, EDBT/ICDT 2012 Joint Conference, Berlin, Germany, ACM 2012, pp. 3-14. -[16] Grzegorz Malewicz, Matthew H. Austern, Aart J. C.
Bik, James C. Dehnert, Ilan Horn, Naty Leiser, and Grzegorz Czajkowski, Pregel: A System for Large-Scale Graph Processing, SIGMOD '10, June 6–11, 2010, pp. 135-145. -[17] Hadoop, "Powered by Hadoop," http://wiki.apache.org/hadoop/PoweredBy. -[18] Pig Tutorial, Yahoo Inc., http://developer.yahoo.com/hadoop/tutorial/pigtutorial.html -[19] Apache: Apache Hadoop, http://hadoop.apache.org -[20] Apache Hive, http://hive.apache.org/ -[21] Apache Giraph Project, http://giraph.apache.org/ -[22] Mahout, http://lucene.apache.org/mahout/ -[23] Amazon Simple Storage Service (Amazon S3), http://aws.amazon.com/s3/ -[24] Windows Azure Storage, http://www.microsoft.com/windowsazure/features/storage/ -[25] Steve Lohr, The Age of Big Data, New York Times, Feb 11, 2012, http://www.nytimes.com/2012/02/12/sunday-review/big-datas-impact-in-the-world.html -[26] Information System & Management, ISM Book, 1st Edition 2010, EMC2, Wiley Publishing. -[27] Dryad, Microsoft Research, http://research.microsoft.com/en-us/projects/dryad/ -[28] IBM, What is Jaql, www.ibm.com/software/data/infosphere/hadoop/jaql/ \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt.xml.xls deleted file mode 100644 index 039b248ff187b904751eb0062c2edc367965f076..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks-relation.txt deleted file mode 100644 index 916b72dd1136ac5988e6f9948a37bcb8ee509848..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks-relation.txt +++ /dev/null @@ -1,4 +0,0 @@ -copyright � 2018 elsevier b.v.©right � 2018 elsevier b.v.&依赖 -its&licensors& -registered trademark&elsevier b.v. term and condition&AGGREGATION -ScienceDirect �&elsevier b.v.
term and condition&依赖 diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt deleted file mode 100644 index eccf8ce03f201707c2203ff3c50d6a62208ec746..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt +++ /dev/null @@ -1,43 +0,0 @@ -MapReduce -Related terms: -Extreme Learning Machine, Cloud Computing, Hadoop, Dataset, Programming -Model -View all Topics -Big Data Management on Wireless Sensor -Networks -In Big Data Analytics for Sensor-Network Collected Intelligence, 2017 -> Read full chapter -Big Data Analytics Challenges and Solutions -In Big Data Analytics for Intelligent Healthcare Management, 2019 -> Read full chapter -Big data principles and paradigm -In Ocean Energy Modeling and Simulation with Big Data, 2020 -> Read full chapter -Extreme Learning Machine and Its Applications -in Big Data Processing -In Big Data Analytics for Sensor-Network Collected Intelligence, 2017 -> Read full chapter -Energy Efficiency in Data Centers and -Clouds -In Advances in Computers, 2016 -> Read full chapter -Climate Analytics as a Service -In Cloud Computing in Ocean and Atmospheric Sciences, 2016 -> Read full chapter -A Deep Dive into NoSQL Databases: -The Use Cases and Applications -In Advances in Computers, 2018 -> Read full chapter -Hadoop in the Cloud to Analyze Climate -Datasets -In Cloud Computing in Ocean and Atmospheric Sciences, 2016 -> Read full chapter -Ocean energy data learning from big -data -In Ocean Energy Modeling and Simulation with Big Data, 2020 -> Read full chapter -Connected Computing Environment -In Advances in Computers, 2013 -> Read full chapter -ScienceDirect is Elsevier’s leading information solution for researchers. -Copyright © 2018 Elsevier B.V. or its licensors or contributors. ScienceDirect ® is a registered trademark of Elsevier B.V. Terms and conditions apply. 
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt.xml.xls deleted file mode 100644 index f9484f874a6ec1ea9b327478269900ead4bca34d..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce-relation.txt deleted file mode 100644 index 5d5debe8d6d827c54c7c77a01fb8ca31c3065f4d..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce-relation.txt +++ /dev/null @@ -1,176 +0,0 @@ -huge amount&datum&AGGREGATION -we&application&依赖 -we&which&依赖 -we&large cluster&依赖 -we&huge amount&依赖 -large cluster&commodity hardware&AGGREGATION -we&reliable manner&依赖 -MapReduce algorithm&two important task&依赖 -MapReduce algorithm&namely map&依赖 -Map&set&依赖 -set&datum&AGGREGATION -Map&datum&依赖 -smaller set&tuple&AGGREGATION -sequence&name MapReduce&AGGREGATION -major advantage&MapReduce&AGGREGATION -thousand&machine&AGGREGATION -we&application&依赖 -we&MapReduce form&依赖 -simple scalability&many programmer&依赖 -simple scalability&MapReduce model&依赖 -MapReduce program&three stage&依赖 -MapReduce program&program&GENERALIZATION -form&file or directory&AGGREGATION -input datum&file or directory&依赖 -input file&file&GENERALIZATION -input file&line&依赖 -input file&mapper function line&依赖 -mapper&datum&依赖 -several small chunk&datum&AGGREGATION -combination&Shuffle stage&AGGREGATION -Reducer ’s job&datum&依赖 -it&new set&依赖 -it&output&依赖 -it&new set&依赖 -it&output&依赖 -new set&output&AGGREGATION -Hadoop&appropriate server&依赖 -Hadoop&Map and Reduce task&依赖 -Hadoop&Map and Reduce task&依赖 -Hadoop&cluster&依赖 -framework&detail&依赖 -detail&data-passing such as&AGGREGATION -framework&data-passing such as&依赖 -framework&task&依赖 -local disk&network traffic&依赖 -Most&computing&AGGREGATION -Most&place&依赖 -completion&given task&AGGREGATION -cluster&given task&依赖 -cluster&completion&依赖 -set&< key , value > pair&AGGREGATION -job&different type&AGGREGATION -MapReduce framework&framework&GENERALIZATION -framework&input&依赖 -MapReduce framework&< key&依赖 -framework&set&依赖 -framework&input&依赖 -output&job&AGGREGATION -framework&< key , value > pair&依赖 -key&framework&依赖 -key class&Writable-Comparable interface&实现 -Input and Output type&a mapreduce job − ( input )&AGGREGATION -Input and Output type&v3 > ( output )&依赖 -Input and Output type&v2 > → reduce → < k3&依赖 -core&job&AGGREGATION -Input Output Map < k1 and v1 > list ( and < k2 , v2 > ) reduce&Map&实现 -Input Output Map < k1 and v1 > list ( and < k2 , v2 > ) reduce&Map&实现 -Input Output Map < k1 and v1 > list ( and < k2 , v2 > ) reduce&Map&实现 -Mapper − Mapper&set&依赖 -Mapper − Mapper&input key/value pair&依赖 -set&intermediate key/value pair&AGGREGATION -Mapper − Mapper&intermediate key/value pair&依赖 -namednode − node&file system ( hdf )&依赖 -datum&advance&依赖 -datanode − node&datanode − node&依赖 -processing&place&依赖 -masternode − node&masternode − node&依赖 -slavenode − node&slavenode − node&依赖 -jobtracker − schedules job&jobtracker − schedules job&依赖 -Task Tracker −&task and report status&依赖 -Task Tracker −&JobTracker&依赖 -program&dataset&依赖 -program&dataset&依赖 
-program&Mapper and Reducer&依赖 -execution&Mapper and Reducer&AGGREGATION -task −&Mapper&依赖 -task −&Mapper&依赖 -task −&execution&依赖 -task −&Mapper&依赖 -task −&Mapper&依赖 -task −&execution&依赖 -task −&execution&依赖 -task −&execution&依赖 -task −&execution&依赖 -task −&Mapper&依赖 -task −&execution&依赖 -task −&Mapper&依赖 -task −&execution&依赖 -task −&execution&依赖 -task −&Mapper&依赖 -task −&execution&依赖 -task −&Mapper&依赖 -task −&Mapper&依赖 -execution&Mapper&AGGREGATION -slice&datum&AGGREGATION -task attempt −&particular instance&依赖 -task attempt −&an attempt&AGGREGATION -electrical consumption&organization&AGGREGATION -It&monthly electrical consumption&依赖 -we&application&依赖 -above datum&input&依赖 -year&maximum usage and year&AGGREGATION -year&minimum usage&AGGREGATION -finite number&record&AGGREGATION -They&required output&依赖 -They&logic&依赖 -electrical consumption&largescale industry&AGGREGATION -largescale industry&particular state&AGGREGATION -its&formation& -we&such bulk datum&依赖 -They&time&依赖 -They&lot&依赖 -lot&time&AGGREGATION -we&datum&依赖 -we&source&依赖 -we&network server&依赖 -we&MapReduce framework&依赖 -1979 23 23 2 43 24 25 26 26 26 26 25 26 25 1980 26 27 28 28 28 30 31 31 31 30 30 30 29 1981 31 32 32 32 33 34 35 36 36 34 34 34 34 1984 39 38 39 39 39 41 42 43 40 39 38 38 40 1985 38 39 39 39 39 41 41 41 00 40 39 39 45 Example Program&MapReduce framework&依赖 -1979 23 23 2 43 24 25 26 26 26 26 25 26 25 1980 26 27 28 28 28 30 31 31 31 30 30 30 29 1981 31 32 32 32 33 34 35 36 36 34 34 34 34 1984 39 38 39 39 39 41 42 43 40 39 38 38 40 1985 38 39 39 39 39 41 41 41 00 40 39 39 45 Example Program&sample datum&依赖 -1979 23 23 2 43 24 25 26 26 26 26 25 26 25 1980 26 27 28 28 28 30 31 31 31 30 30 30 29 1981 31 32 32 32 33 34 35 36 36 34 34 34 34 1984 39 38 39 39 39 41 42 43 40 39 38 38 40 1985 38 39 39 39 39 41 41 41 00 40 39 39 45 Example Program&sample datum&依赖 -1979 23 23 2 43 24 25 26 26 26 26 25 26 25 1980 26 27 28 28 28 30 31 31 31 30 30 30 29 1981 31 32 32 32 33 34 35 36 36 34 34 34 34 1984 39 38 39 39 39 41 42 43 40 39 38 38 40 1985 38 39 39 39 39 41 41 41 00 40 39 39 45 Example Program&MapReduce framework&依赖 -/ / Reducer class public static class E_EReduce extends MapReduceBase&IntWritable > {&实现 -/ / Reducer class public static class E_EReduce extends MapReduceBase&Reducer < Text and IntWritable and Text&实现 -compilation and execution&program&AGGREGATION -home directory&Hadoop user&AGGREGATION -Compilation and Execution&Process Units Program&AGGREGATION -we&Hadoop user&依赖 -we&home/hadoop )&依赖 -Step 1&directory&依赖 -Step 1&compiled java class&依赖 -Step 1&compiled java class&依赖 -Step 1&directory&依赖 -$ mkdir unit&2 Download Hadoop-core-1.2.1.jar&依赖 -follow link mvnrepository.com&jar&依赖 -input_dir step&5&依赖 -input directory&hdf&AGGREGATION -$ HADOOP_HOME / bin/hadoop jar units.jar hadoop.ProcessUnits&while&依赖 -$ HADOOP_HOME / bin/hadoop jar units.jar hadoop.ProcessUnits&Wait&依赖 -output&number&依赖 -output&number&依赖 -output&input split&依赖 -number&input split&AGGREGATION -number&Map task&AGGREGATION -output&input split&依赖 -number&reducer task&AGGREGATION -FILE&large read operation&AGGREGATION -Number&write operation&AGGREGATION -Number&byte&AGGREGATION -Number&read operation&AGGREGATION -Number&large read operation&AGGREGATION -File&Counters& -file&hdf&依赖 -Hadoop command&$ HADOOP_HOME / bin/hadoop command&依赖 -cat output_dir / part-00000 / bin/hadoop df&output_dir / home/hadoop Important command&依赖 -table&option&依赖 -their&description& -20 distcp &20 distcp &依赖 -class path&Hadoop jar&依赖 -Hadoop jar&jar&GENERALIZATION 
-events&details& -- list&job&依赖 -Killed task&failed attempt&依赖 -Failed task&failed attempt&依赖 -priority&job&AGGREGATION -history&bin/hadoop job&AGGREGATION -status&bin/hadoop job&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt deleted file mode 100644 index b0b29020667b0edc30547955405aa2147b0bef79..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt +++ /dev/null @@ -1,469 +0,0 @@ -MapReduce is a framework using which we can write applications to process huge amounts of data, in parallel, on large clusters of commodity hardware in a reliable manner. - -What is MapReduce? -MapReduce is a processing technique and a program model for distributed computing based on java. The MapReduce algorithm contains two important tasks, namely Map and Reduce. Map takes a set of data and converts it into another set of data, where individual elements are broken down into tuples (key/value pairs). Secondly, reduce task, which takes the output from a map as an input and combines those data tuples into a smaller set of tuples. As the sequence of the name MapReduce implies, the reduce task is always performed after the map job. - -The major advantage of MapReduce is that it is easy to scale data processing over multiple computing nodes. Under the MapReduce model, the data processing primitives are called mappers and reducers. Decomposing a data processing application into mappers and reducers is sometimes nontrivial. But, once we write an application in the MapReduce form, scaling the application to run over hundreds, thousands, or even tens of thousands of machines in a cluster is merely a configuration change. This simple scalability is what has attracted many programmers to use the MapReduce model. - -The Algorithm -Generally MapReduce paradigm is based on sending the computer to where the data resides! - -MapReduce program executes in three stages, namely map stage, shuffle stage, and reduce stage. - -Map stage − The map or mapper’s job is to process the input data. Generally the input data is in the form of file or directory and is stored in the Hadoop file system (HDFS). The input file is passed to the mapper function line by line. The mapper processes the data and creates several small chunks of data. - -Reduce stage − This stage is the combination of the Shuffle stage and the Reduce stage. The Reducer’s job is to process the data that comes from the mapper. After processing, it produces a new set of output, which will be stored in the HDFS. - -During a MapReduce job, Hadoop sends the Map and Reduce tasks to the appropriate servers in the cluster. - -The framework manages all the details of data-passing such as issuing tasks, verifying task completion, and copying data around the cluster between the nodes. - -Most of the computing takes place on nodes with data on local disks that reduces the network traffic. - -After completion of the given tasks, the cluster collects and reduces the data to form an appropriate result, and sends it back to the Hadoop server. - -MapReduce Algorithm -Inputs and Outputs (Java Perspective) -The MapReduce framework operates on pairs, that is, the framework views the input to the job as a set of pairs and produces a set of pairs as the output of the job, conceivably of different types. 
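As a brief editorial illustration of this point (a hedged sketch, not part of the original tutorial), the generic type parameters of the newer org.apache.hadoop.mapreduce classes make the "conceivably different types" explicit:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Mapper<K1, V1, K2, V2>: consumes (LongWritable offset, Text line) input pairs
// and emits (Text, IntWritable) intermediate pairs; the two sides differ.
class YearMapper extends Mapper<LongWritable, Text, Text, IntWritable> { }

// Reducer<K2, V2, K3, V3>: consumes the grouped intermediate pairs and may emit
// yet another pair type; here the output type happens to match the input type.
class MaxReducer extends Reducer<Text, IntWritable, Text, IntWritable> { }

The framework only requires that the mapper's output pair type line up with the reducer's input pair type; the serialization requirements on these key and value classes are discussed next.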
- -The key and the value classes should be in serialized manner by the framework and hence, need to implement the Writable interface. Additionally, the key classes have to implement the Writable-Comparable interface to facilitate sorting by the framework. Input and Output types of a MapReduce job − (Input) → map → → reduce → (Output). - -Input Output -Map list () -Reduce list () -Terminology -PayLoad − Applications implement the Map and the Reduce functions, and form the core of the job. - -Mapper − Mapper maps the input key/value pairs to a set of intermediate key/value pair. - -NamedNode − Node that manages the Hadoop Distributed File System (HDFS). - -DataNode − Node where data is presented in advance before any processing takes place. - -MasterNode − Node where JobTracker runs and which accepts job requests from clients. - -SlaveNode − Node where Map and Reduce program runs. - -JobTracker − Schedules jobs and tracks the assign jobs to Task tracker. - -Task Tracker − Tracks the task and reports status to JobTracker. - -Job − A program is an execution of a Mapper and Reducer across a dataset. - -Task − An execution of a Mapper or a Reducer on a slice of data. - -Task Attempt − A particular instance of an attempt to execute a task on a SlaveNode. - -Example Scenario -Given below is the data regarding the electrical consumption of an organization. It contains the monthly electrical consumption and the annual average for various years. - -Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec Avg -1979 23 23 2 43 24 25 26 26 26 26 25 26 25 -1980 26 27 28 28 28 30 31 31 31 30 30 30 29 -1981 31 32 32 32 33 34 35 36 36 34 34 34 34 -1984 39 38 39 39 39 41 42 43 40 39 38 38 40 -1985 38 39 39 39 39 41 41 41 00 40 39 39 45 -If the above data is given as input, we have to write applications to process it and produce results such as finding the year of maximum usage, year of minimum usage, and so on. This is a walkover for the programmers with finite number of records. They will simply write the logic to produce the required output, and pass the data to the application written. - -But, think of the data representing the electrical consumption of all the largescale industries of a particular state, since its formation. - -When we write applications to process such bulk data, - -They will take a lot of time to execute. - -There will be a heavy network traffic when we move data from source to network server and so on. - -To solve these problems, we have the MapReduce framework. - -Input Data -The above data is saved as sample.txtand given as input. The input file looks as shown below. - -1979 23 23 2 43 24 25 26 26 26 26 25 26 25 -1980 26 27 28 28 28 30 31 31 31 30 30 30 29 -1981 31 32 32 32 33 34 35 36 36 34 34 34 34 -1984 39 38 39 39 39 41 42 43 40 39 38 38 40 -1985 38 39 39 39 39 41 41 41 00 40 39 39 45 -Example Program -Given below is the program to the sample data using MapReduce framework. 
- -package hadoop; - -import java.util.*; - -import java.io.IOException; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.conf.*; -import org.apache.hadoop.io.*; -import org.apache.hadoop.mapred.*; -import org.apache.hadoop.util.*; - -public class ProcessUnits { - //Mapper class - public static class E_EMapper extends MapReduceBase implements - Mapper<LongWritable, /*Input key Type*/ - Text, /*Input value Type*/ - Text, /*Output key Type*/ - IntWritable> /*Output value Type*/ - { - //Map function: extracts the year and the yearly average from each input line - public void map(LongWritable key, Text value, - OutputCollector<Text, IntWritable> output, - - Reporter reporter) throws IOException { - String line = value.toString(); - String lasttoken = null; - StringTokenizer s = new StringTokenizer(line,"\t"); - String year = s.nextToken(); - - while(s.hasMoreTokens()) { - lasttoken = s.nextToken(); - } - int avgprice = Integer.parseInt(lasttoken); - output.collect(new Text(year), new IntWritable(avgprice)); - } - } - - //Reducer class - public static class E_EReduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { - - //Reduce function: emits the values that exceed the 30-unit threshold for each year - public void reduce(Text key, Iterator<IntWritable> values, - OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { - int maxavg = 30; - int val = Integer.MIN_VALUE; - - while (values.hasNext()) { - if((val = values.next().get()) > maxavg) { - output.collect(key, new IntWritable(val)); - } - } - } - } - - //Main function - public static void main(String args[]) throws Exception { - JobConf conf = new JobConf(ProcessUnits.class); - - conf.setJobName("max_eletricityunits"); - conf.setOutputKeyClass(Text.class); - conf.setOutputValueClass(IntWritable.class); - conf.setMapperClass(E_EMapper.class); - conf.setCombinerClass(E_EReduce.class); - conf.setReducerClass(E_EReduce.class); - conf.setInputFormat(TextInputFormat.class); - conf.setOutputFormat(TextOutputFormat.class); - - FileInputFormat.setInputPaths(conf, new Path(args[0])); - FileOutputFormat.setOutputPath(conf, new Path(args[1])); - - JobClient.runJob(conf); - } -} -Save the above program as ProcessUnits.java. The compilation and execution of the program is explained below. - -Compilation and Execution of Process Units Program -Let us assume we are in the home directory of a Hadoop user (e.g. /home/hadoop). - -Follow the steps given below to compile and execute the above program. - -Step 1 -The following command is to create a directory to store the compiled java classes. - -$ mkdir units -Step 2 -Download Hadoop-core-1.2.1.jar, which is used to compile and execute the MapReduce program. Visit the following link mvnrepository.com to download the jar. Let us assume the downloaded folder is /home/hadoop/. - -Step 3 -The following commands are used for compiling the ProcessUnits.java program and creating a jar for the program. - -$ javac -classpath hadoop-core-1.2.1.jar -d units ProcessUnits.java -$ jar -cvf units.jar -C units/ . -Step 4 -The following command is used to create an input directory in HDFS. - -$HADOOP_HOME/bin/hadoop fs -mkdir input_dir -Step 5 -The following command is used to copy the input file named sample.txt in the input directory of HDFS. - -$HADOOP_HOME/bin/hadoop fs -put /home/hadoop/sample.txt input_dir -Step 6 -The following command is used to verify the files in the input directory. - -$HADOOP_HOME/bin/hadoop fs -ls input_dir/ -Step 7 -The following command is used to run the Eleunit_max application by taking the input files from the input directory. - -$HADOOP_HOME/bin/hadoop jar units.jar hadoop.ProcessUnits input_dir output_dir -Wait for a while until the file is executed.
After execution, as shown below, the output will contain the number of input splits, the number of Map tasks, the number of reducer tasks, etc. - -INFO mapreduce.Job: Job job_1414748220717_0002 -completed successfully -14/10/31 06:02:52 -INFO mapreduce.Job: Counters: 49 - File System Counters - -FILE: Number of bytes read = 61 -FILE: Number of bytes written = 279400 -FILE: Number of read operations = 0 -FILE: Number of large read operations = 0 -FILE: Number of write operations = 0 -HDFS: Number of bytes read = 546 -HDFS: Number of bytes written = 40 -HDFS: Number of read operations = 9 -HDFS: Number of large read operations = 0 -HDFS: Number of write operations = 2 Job Counters - - - Launched map tasks = 2 - Launched reduce tasks = 1 - Data-local map tasks = 2 - Total time spent by all maps in occupied slots (ms) = 146137 - Total time spent by all reduces in occupied slots (ms) = 441 - Total time spent by all map tasks (ms) = 14613 - Total time spent by all reduce tasks (ms) = 44120 - Total vcore-seconds taken by all map tasks = 146137 - Total vcore-seconds taken by all reduce tasks = 44120 - Total megabyte-seconds taken by all map tasks = 149644288 - Total megabyte-seconds taken by all reduce tasks = 45178880 - -Map-Reduce Framework - - Map input records = 5 - Map output records = 5 - Map output bytes = 45 - Map output materialized bytes = 67 - Input split bytes = 208 - Combine input records = 5 - Combine output records = 5 - Reduce input groups = 5 - Reduce shuffle bytes = 6 - Reduce input records = 5 - Reduce output records = 5 - Spilled Records = 10 - Shuffled Maps = 2 - Failed Shuffles = 0 - Merged Map outputs = 2 - GC time elapsed (ms) = 948 - CPU time spent (ms) = 5160 - Physical memory (bytes) snapshot = 47749120 - Virtual memory (bytes) snapshot = 2899349504 - Total committed heap usage (bytes) = 277684224 - -File Output Format Counters - - Bytes Written = 40 -Step 8 -The following command is used to verify the resultant files in the output folder. - -$HADOOP_HOME/bin/hadoop fs -ls output_dir/ -Step 9 -The following command is used to see the output in Part-00000 file. This file is generated by HDFS. - -$HADOOP_HOME/bin/hadoop fs -cat output_dir/part-00000 -Below is the output generated by the MapReduce program. - -1981 34 -1984 40 -1985 45 -Step 10 -The following command is used to copy the output folder from HDFS to the local file system for analyzing. - -$HADOOP_HOME/bin/hadoop fs -cat output_dir/part-00000/bin/hadoop dfs get output_dir /home/hadoop -Important Commands -All Hadoop commands are invoked by the $HADOOP_HOME/bin/hadoop command. Running the Hadoop script without any arguments prints the description for all commands. - -Usage − hadoop [--config confdir] COMMAND - -The following table lists the options available and their description. - -Sr.No. Option & Description -1 -namenode -format - -Formats the DFS filesystem. - -2 -secondarynamenode - -Runs the DFS secondary namenode. - -3 -namenode - -Runs the DFS namenode. - -4 -datanode - -Runs a DFS datanode. - -5 -dfsadmin - -Runs a DFS admin client. - -6 -mradmin - -Runs a Map-Reduce admin client. - -7 -fsck - -Runs a DFS filesystem checking utility. - -8 -fs - -Runs a generic filesystem user client. - -9 -balancer - -Runs a cluster balancing utility. - -10 -oiv - -Applies the offline fsimage viewer to an fsimage. - -11 -fetchdt - -Fetches a delegation token from the NameNode. - -12 -jobtracker - -Runs the MapReduce job Tracker node. - -13 -pipes - -Runs a Pipes job. 
- -14 -tasktracker - -Runs a MapReduce task Tracker node. - -15 -historyserver - -Runs job history servers as a standalone daemon. - -16 -job - -Manipulates the MapReduce jobs. - -17 -queue - -Gets information regarding JobQueues. - -18 -version - -Prints the version. - -19 -jar - -Runs a jar file. - -20 -distcp - -Copies file or directories recursively. - -21 -distcp2 - -DistCp version 2. - -22 -archive -archiveName NAME -p * - -Creates a hadoop archive. - -23 -classpath - -Prints the class path needed to get the Hadoop jar and the required libraries. - -24 -daemonlog - -Get/Set the log level for each daemon - -How to Interact with MapReduce Jobs -Usage − hadoop job [GENERIC_OPTIONS] - -The following are the Generic Options available in a Hadoop job. - -Sr.No. GENERIC_OPTION & Description -1 --submit - -Submits the job. - -2 --status - -Prints the map and reduce completion percentage and all job counters. - -3 --counter - -Prints the counter value. - -4 --kill - -Kills the job. - -5 --events <#-of-events> - -Prints the events' details received by jobtracker for the given range. - -6 --history [all] - history < jobOutputDir> - -Prints job details, failed and killed tip details. More details about the job such as successful tasks and task attempts made for each task can be viewed by specifying the [all] option. - -7 --list[all] - -Displays all jobs. -list displays only jobs which are yet to complete. - -8 --kill-task - -Kills the task. Killed tasks are NOT counted against failed attempts. - -9 --fail-task - -Fails the task. Failed tasks are counted against failed attempts. - -10 --set-priority - -Changes the priority of the job. Allowed priority values are VERY_HIGH, HIGH, NORMAL, LOW, VERY_LOW - -To see the status of job -$ $HADOOP_HOME/bin/hadoop job -status -e.g. -$ $HADOOP_HOME/bin/hadoop job -status job_201310191043_0004 -To see the history of job output-dir -$ $HADOOP_HOME/bin/hadoop job -history -e.g. -$ $HADOOP_HOME/bin/hadoop job -history /user/expert/output -To kill the job -$ $HADOOP_HOME/bin/hadoop job -kill -e.g. 
-$ $HADOOP_HOME/bin/hadoop job -kill job_201310191043_0004 \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt.xml.xls deleted file mode 100644 index 412406ce0d8413362275e906413495ce1ebadd88..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt.xml.xls and /dev/null differ diff --git "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce-relation.txt" "b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce-relation.txt" deleted file mode 100644 index 739e1aa773dbd933acd360bdc4007a51b8fcf466..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce-relation.txt" +++ /dev/null @@ -1,346 +0,0 @@ -Hadoop Architecture&Big Data Course !!&依赖 -your&career& -Hadoop Architecture&Big Data Course !!&依赖 -design&Hadoop&AGGREGATION -design&various goal&依赖 -handling&large dataset&AGGREGATION -we&blog&依赖 -we&Hadoop Architecture&依赖 -we&detail&依赖 -we&Hadoop Architecture Diagram&依赖 -’s&Hadoop Architecture&依赖 -master-slave topology&topology&GENERALIZATION -Hadoop&master-slave topology&依赖 -we&one master node&依赖 -we&topology&依赖 -’s function&task&依赖 -’s function&various slave node&依赖 -node&function& -slave node&actual computing&依赖 -Slave node&real datum&依赖 -we&master&依赖 -metadata&what&依赖 -Hadoop Architecture&three major layer&依赖 -hdf ( hadoop&file system ) yarn mapreduce 1&依赖 -hdf ( hadoop&file system ) yarn mapreduce 1&依赖 -HDFS hdf&Hadoop Distributed File System&依赖 -data storage&Hadoop&AGGREGATION -hdf&data unit&依赖 -hdf&smaller unit&依赖 -It&two daemons run&依赖 -namenode and datanode hdfs&Master-slave architecture&依赖 -daemon&master server&依赖 -daemon&master server&依赖 -It&Namespace management&依赖 -DataNode daemon&slave node&依赖 -DataNode daemon&daemon&GENERALIZATION -file&data block&依赖 -file&number&依赖 -number&data block&AGGREGATION -group&slave machine&AGGREGATION -Namenode&system namespace&依赖 -Namenode&modification&依赖 -opening&files or directory&依赖 -NameNode&track&依赖 -NameNode&DataNodes&依赖 -mapping&block&AGGREGATION -NameNode&mapping&依赖 -track&mapping&AGGREGATION -DataNodes&read/write request&依赖 -DataNodes&file system ’s client&依赖 -DataNode&NameNode&依赖 -DataNode&delete&依赖 -DataNode&demand&依赖 -native language&hdf&AGGREGATION -Java&hdf&依赖 -one&machine&依赖 -one&DataNode and NameNode&依赖 -one&having&依赖 -one dedicated machine&typical deployment&依赖 -one dedicated machine&typical deployment&依赖 -other node&cluster run datanode&依赖 -other node&cluster run datanode&依赖 -NameNode&metada&依赖 -location&block&AGGREGATION -NameNode&metada&依赖 -NameNode&block&依赖 -NameNode&DataNodes&依赖 -You&hadoop high availability concept&依赖 -You&hadoop high availability concept&依赖 -smallest unit&storage&AGGREGATION -default block size&block size&GENERALIZATION -block size&size&GENERALIZATION -we&128MB&依赖 -we&default block size&依赖 -we&default block size&依赖 -we&128MB&依赖 -default block size&128MB&AGGREGATION -One&block size&依赖 -us&example&依赖 -us&file&依赖 -example&file&AGGREGATION -128mb then hdf&6 block&依赖 -our&size& -128mb then hdf&file&依赖 -128MB and one block&60MB&AGGREGATION -Five block&128MB and one block&AGGREGATION -we&size&依赖 -we&size&依赖 -we&file&依赖 -we&file&依赖 -file&size&AGGREGATION -we&file&依赖 -we&file&依赖 
-we&size&依赖 -we&size&依赖 -we&numerous block&依赖 -4kb&block size&AGGREGATION -huge metada&NameNode&依赖 -Replication Management&replication technique&依赖 -copy&block and store&AGGREGATION -it©&依赖 -it&block and store&依赖 -it&block and store&依赖 -it©&依赖 -it&different datanode&依赖 -it&different datanode&依赖 -how many copy&block&AGGREGATION -we&value&依赖 -It&default&依赖 -file&1GB&AGGREGATION -we&file&依赖 -we&1GB&依赖 -replication factor&3&AGGREGATION -it&3gb&依赖 -3gb&total storage&AGGREGATION -it&total storage&依赖 -NameNode&block report&依赖 -NameNode&DataNode&依赖 -NameNode&replica&依赖 -rack&many DataNode machine&依赖 -hdf&block&依赖 -replica&block&AGGREGATION -hdf&replica&依赖 -hdf&rack awareness algorithm&依赖 -hdf&distributed fashion&依赖 -rack awareness algorithm&local rack&依赖 -rack awareness algorithm&first block&依赖 -It&more than two block&依赖 -It&same rack&依赖 -It&possible&依赖 -MapReduce MapReduce&Hadoop&依赖 -MapReduce MapReduce&MapReduce&GENERALIZATION -data processing layer&Hadoop&AGGREGATION -large amount&datum&AGGREGATION -MapReduce&cluster&依赖 -cluster&low-end machine&AGGREGATION -MapReduce&low-end machine&依赖 -MapReduce&application&依赖 -MapReduce&application&依赖 -It&reliable and fault-tolerant manner&依赖 -number&map task&AGGREGATION -MapReduce job&map task&依赖 -MapReduce job&number&依赖 -task&part&依赖 -part&datum&AGGREGATION -task&datum&依赖 -function&transform and filter datum&依赖 -function&Map task&AGGREGATION -sub-set&output&AGGREGATION -Reduce task&intermediate datum&依赖 -Reduce task&aggregation&依赖 -Reduce task&map task&依赖 -input file&hdf&依赖 -input file&file&GENERALIZATION -MapReduce job&job&GENERALIZATION -input file&hdf&依赖 -inputformat&input file&依赖 -byte-oriented view&chunk&AGGREGATION -chunk&input file&AGGREGATION -input split&map task&依赖 -map task&task&GENERALIZATION -map task&node&依赖 -RecordReader The recordreader&record&依赖 -RecordReader The recordreader&input split&依赖 -It&datum&依赖 -It&record&依赖 -mapper function&function&GENERALIZATION -datum&record&依赖 -mapper&phase&依赖 -mapper&key-value pair&依赖 -mapper&recordreader&依赖 -It&zero or multiple intermediate key-value pair&依赖 -decision&mapper function&依赖 -decision&mapper function&依赖 -reducer function&datum&依赖 -reducer function&operation&依赖 -reducer function&function&GENERALIZATION -Combiner&intermediate datum&依赖 -Combiner&mapper&依赖 -It&one mapper&依赖 -small scope&one mapper&AGGREGATION -It&small scope&依赖 -amount&datum&AGGREGATION -1 ) three time&more network bandwidth&依赖 -1 ) three time&example&依赖 -Partitioner Partitioner&intermediate key-value pair&依赖 -Partitioner Partitioner&mapper&依赖 -It&one shard&依赖 -It&them&依赖 -It&reducer&依赖 -It&them&依赖 -partitioner&hashcode&依赖 -hashcode&key&AGGREGATION -partitioner&key&依赖 -partitioner&hashcode&依赖 -partitioner&key&依赖 -partitioner&modulus operation&依赖 -partitioner&reducer )&依赖 -number&reducer&AGGREGATION -key.hashcode ( ) % ( number&reducer )&AGGREGATION -partitioner&modulus operation&依赖 -partitioner&key.hashcode ( ) % ( number&依赖 -partitioned datum&local file system&依赖 -partitioned datum&map task&依赖 -reducer&it&依赖 -reducer&shuffle and sort step&依赖 -this step download&written&依赖 -this step download&datum&依赖 -this step sort&individual data piece&依赖 -this step sort&individual data piece&依赖 -this step sort&large data list&依赖 -this step sort&large data list&依赖 -purpose&sort&AGGREGATION -purpose&equivalent key&依赖 -we&it&依赖 -framework&everything&依赖 -key&comparator object&依赖 -developer&control&依赖 -reducer&function&依赖 -reducer&key grouping&依赖 -framework&function key&依赖 -function key&key&GENERALIZATION -number&different way&AGGREGATION 
-We&reducer&依赖 -it&zero or more key-value pair&依赖 -it&solution&依赖 -core logic&solution&AGGREGATION -It&key-value pair&依赖 -It&reducer&依赖 -it&key and value&依赖 -it&tab&依赖 -it&tab&依赖 -it&key and value&依赖 -We&it&依赖 -YARN YARN&Hadoop&依赖 -YARN YARN&YARN&GENERALIZATION -resource management layer&Hadoop&AGGREGATION -basic principle&resource management and job scheduling/monitoring function&依赖 -basic principle&resource management and job scheduling/monitoring function&依赖 -one global ResourceManager and per-application ApplicationMaster&YARN&依赖 -Application&job&依赖 -single job&job&AGGREGATION -we&YARN framework&依赖 -YARN framework&framework&GENERALIZATION -we&two daemon resourcemanager and nodemanager&依赖 -ResourceManager&system&依赖 -ResourceManager&application&依赖 -ResourceManager&resource&依赖 -job&container&依赖 -job&resource usage&依赖 -job&NodeManger&AGGREGATION -ApplcationMaster&ResourceManager&依赖 -ApplcationMaster&resource&依赖 -ResourceManger&Scheduler&依赖 -ResourceManger&two important component&依赖 -it&tracking&依赖 -tracking&status&AGGREGATION -it&application&依赖 -it&tracking&依赖 -it&application&依赖 -It&task&依赖 -scheduler&resource&依赖 -requirement&application&AGGREGATION -function&ApplicationManager&AGGREGATION -Application Manager follow&ApplicationManager&依赖 -container&CPU , memory , disk , and network&依赖 -container&element&依赖 -function&ApplicationMaster&AGGREGATION -monitor progress&application&AGGREGATION -We&YARN&依赖 -We&YARN Federation feature&依赖 -We&few thousand node&依赖 -feature&multiple YARN cluster&依赖 -feature&us&依赖 -feature&Yarn YARN&AGGREGATION -feature&features :&依赖 -YARN&access engine&依赖 -variety&access engine&AGGREGATION -YARN&open-source or propriety )&依赖 -YARN&variety&依赖 -YARN&resource&依赖 -YARN&cluster&依赖 -YARN&dynamic allocation&依赖 -YARN&good use&依赖 -dynamic allocation&resource&AGGREGATION -good use&cluster&AGGREGATION -previous version&Hadoop&AGGREGATION -lesser utilization&cluster&AGGREGATION -YARN ’s ResourceManager&ever-expanding cluster&依赖 -petabyte&datum&AGGREGATION -YARN ’s ResourceManager&scheduling and cope&依赖 -YARN ’s ResourceManager&petabyte&依赖 -YARN ’s ResourceManager&datum&依赖 -MapReduce program&YARN&依赖 -MapReduce program&YARN&依赖 -people&idea&依赖 -people&Hadoop&依赖 -Hadoop&cheap storage and deep datum analysis&依赖 -this use jbod&Disk&依赖 -this use jbod&a bunch&依赖 -their&complexity& -Start Small and Keep Focus Many project&complexity and expense&依赖 -small cluster&node&AGGREGATION -infrastructure and development guy&internal working&依赖 -infrastructure and development guy&Hadoop&依赖 -internal working&Hadoop&AGGREGATION -Data Integration One&feature&AGGREGATION -feature&Hadoop&AGGREGATION -we&data structure&依赖 -We&flume and sqoop&依赖 -We&tool&依赖 -We&datum&依赖 -it&data integration process&依赖 -proper documentation&data source&AGGREGATION -they&cluster&依赖 -Use Compression Technique Enterprise&compression&依赖 -Use Compression Technique Enterprise&love-hate relationship&依赖 -it&performance&依赖 -compression&storage&依赖 -Hadoop&compression&依赖 -It&storage usage&依赖 -It&80 %&依赖 -different project&different requirement&依赖 -Apache Hadoop&Hadoop&GENERALIZATION -Apache Hadoop&wide ecosystem&依赖 -different project&different requirement&依赖 -it&itself&依赖 -design&Hadoop Architecture&AGGREGATION -Its&structure& -We&linearly&依赖 -MapReduce part&principle&依赖 -MapReduce part&principle&依赖 -principle&data locality&AGGREGATION -MapReduce part&design&AGGREGATION -MapReduce part&part&GENERALIZATION -MapReduce part&data locality&依赖 -MapReduce part&data locality&依赖 -Map-Reduce framework&datum&依赖 -Map-Reduce framework&computation 
close&依赖 -network traffic&major bandwidth&依赖 -overall architecture&Hadoop&AGGREGATION -your&Interview& -Hadoop Architecture&Hadoop Interview&依赖 -We&you&依赖 -You&Hadoop Architecture&依赖 -You&many question&依赖 diff --git "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt" "b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt" deleted file mode 100644 index 280390eb0afe3f18e843905c8d1d94dcb004b05e..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt" +++ /dev/null @@ -1,161 +0,0 @@ -Hadoop Architecture in Detail – HDFS, Yarn & MapReduce -Boost your career with Big Data Get Exclusive Offers on Big Data Course!! -Hadoop now has become a popular solution for today’s world needs. The design of Hadoop keeps various goals in mind. These are fault tolerance, handling of large datasets, data locality, portability across heterogeneous hardware and software platforms etc. In this blog, we will explore the Hadoop Architecture in detail. Also, we will see Hadoop Architecture Diagram that helps you to understand it better. - -So, let’s explore Hadoop Architecture. - -Hadoop Architecture in Detail - HDFS, Yarn & MapReduce - -What is Hadoop Architecture? -Hadoop has a master-slave topology. In this topology, we have one master node and multiple slave nodes. Master node’s function is to assign a task to various slave nodes and manage resources. The slave nodes do the actual computing. Slave nodes store the real data whereas on master we have metadata. This means it stores data about data. What does metadata comprise that we will see in a moment? - -Hadoop Application Architecture in Detail - -Hadoop Architecture comprises three major layers. They are:- - -HDFS (Hadoop Distributed File System) -Yarn -MapReduce -1. HDFS -HDFS stands for Hadoop Distributed File System. It provides for data storage of Hadoop. HDFS splits the data unit into smaller units called blocks and stores them in a distributed manner. It has got two daemons running. One for master node – NameNode and other for slave nodes – DataNode. - -a. NameNode and DataNode -HDFS has a Master-slave architecture. The daemon called NameNode runs on the master server. It is responsible for Namespace management and regulates file access by the client. DataNode daemon runs on slave nodes. It is responsible for storing actual business data. Internally, a file gets split into a number of data blocks and stored on a group of slave machines. Namenode manages modifications to file system namespace. These are actions like the opening, closing and renaming files or directories. NameNode also keeps track of mapping of blocks to DataNodes. This DataNodes serves read/write request from the file system’s client. DataNode also creates, deletes and replicates blocks on demand from NameNode. - - - -Hadoop Architecture Diagram - -Java is the native language of HDFS. Hence one can deploy DataNode and NameNode on machines having Java installed. In a typical deployment, there is one dedicated machine running NameNode. And all the other nodes in the cluster run DataNode. The NameNode contains metadata like the location of blocks on the DataNodes. And arbitrates resources among various competing DataNodes. - -You must read about Hadoop High Availability Concept - -b. 
Block in HDFS -Block is nothing but the smallest unit of storage on a computer system. It is the smallest contiguous storage allocated to a file. In Hadoop, we have a default block size of 128MB or 256 MB. - -Hadoop Architecture Diagram - -One should select the block size very carefully. To explain why so let us take an example of a file which is 700MB in size. If our block size is 128MB then HDFS divides the file into 6 blocks. Five blocks of 128MB and one block of 60MB. What will happen if the block is of size 4KB? But in HDFS we would be having files of size in the order terabytes to petabytes. With 4KB of the block size, we would be having numerous blocks. This, in turn, will create huge metadata which will overload the NameNode. Hence we have to choose our HDFS block size judiciously. - -c. Replication Management -To provide fault tolerance HDFS uses a replication technique. In that, it makes copies of the blocks and stores in on different DataNodes. Replication factor decides how many copies of the blocks get stored. It is 3 by default but we can configure to any value. - -Hadoop Replication Factor - -The above figure shows how the replication technique works. Suppose we have a file of 1GB then with a replication factor of 3 it will require 3GBs of total storage. - -To maintain the replication factor NameNode collects block report from every DataNode. Whenever a block is under-replicated or over-replicated the NameNode adds or deletes the replicas accordingly. - -d. What is Rack Awareness? -Hadoop Architecture - -A rack contains many DataNode machines and there are several such racks in the production. HDFS follows a rack awareness algorithm to place the replicas of the blocks in a distributed fashion. This rack awareness algorithm provides for low latency and fault tolerance. Suppose the replication factor configured is 3. Now rack awareness algorithm will place the first block on a local rack. It will keep the other two blocks on a different rack. It does not store more than two blocks in the same rack if possible. - -2. MapReduce -MapReduce is the data processing layer of Hadoop. It is a software framework that allows you to write applications for processing a large amount of data. MapReduce runs these applications in parallel on a cluster of low-end machines. It does so in a reliable and fault-tolerant manner. - -MapReduce job comprises a number of map tasks and reduces tasks. Each task works on a part of data. This distributes the load across the cluster. The function of Map tasks is to load, parse, transform and filter data. Each reduce task works on the sub-set of output from the map tasks. Reduce task applies grouping and aggregation to this intermediate data from the map tasks. - -The input file for the MapReduce job exists on HDFS. The inputformat decides how to split the input file into input splits. Input split is nothing but a byte-oriented view of the chunk of the input file. This input split gets loaded by the map task. The map task runs on the node where the relevant data is present. The data need not move over the network and get processed locally. - -Hadoop Architecture - MapReduce - -i. Map Task -The Map task run in the following phases:- - -a. RecordReader -The recordreader transforms the input split into records. It parses the data into records but does not parse records itself. It provides the data to the mapper function in key-value pairs. Usually, the key is the positional information and value is the data that comprises the record. - -b. 
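As an editorial illustration of the block size and replication factor discussed above (a hedged sketch, not part of the original article), the snippet below uses the Hadoop FileSystem API to create a file with an explicit 128 MB block size and a replication factor of 3, then prints where the replicas of each block were placed. The path and payload are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockInfoDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(); // picks up core-site.xml / hdfs-site.xml
        FileSystem fs = FileSystem.get(conf);

        long blockSize = 128L * 1024 * 1024;         // 128 MB, the default discussed above
        short replication = 3;                       // default replication factor
        Path file = new Path("/tmp/block-demo.bin"); // placeholder path

        // Create the file with an explicit replication factor and block size.
        try (FSDataOutputStream out = fs.create(file, true, 4096, replication, blockSize)) {
            out.write(new byte[1024]); // placeholder payload
        }

        // A 700 MB file with 128 MB blocks splits into 6 blocks (5 x 128 MB + 1 x 60 MB);
        // with replication 3 it occupies roughly 2.1 GB of raw cluster storage.
        FileStatus status = fs.getFileStatus(file);
        for (BlockLocation block : fs.getFileBlockLocations(status, 0, status.getLen())) {
            System.out.println("block @ offset " + block.getOffset()
                    + " replicated on " + String.join(", ", block.getHosts()));
        }
    }
}

On a real cluster the printed hosts would reflect the rack-aware placement just described: one replica on the local rack and the remaining two on a different rack.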
Map -In this phase, the mapper which is the user-defined function processes the key-value pair from the recordreader. It produces zero or multiple intermediate key-value pairs. - -The decision of what will be the key-value pair lies on the mapper function. The key is usually the data on which the reducer function does the grouping operation. And value is the data which gets aggregated to get the final result in the reducer function. - -c. Combiner -The combiner is actually a localized reducer which groups the data in the map phase. It is optional. Combiner takes the intermediate data from the mapper and aggregates them. It does so within the small scope of one mapper. In many situations, this decreases the amount of data needed to move over the network. For example, moving (Hello World, 1) three times consumes more network bandwidth than moving (Hello World, 3). Combiner provides extreme performance gain with no drawbacks. The combiner is not guaranteed to execute. Hence it is not of overall algorithm. - -d. Partitioner -Partitioner pulls the intermediate key-value pairs from the mapper. It splits them into shards, one shard per reducer. By default, partitioner fetches the hashcode of the key. The partitioner performs modulus operation by a number of reducers: key.hashcode()%(number of reducers). This distributes the keyspace evenly over the reducers. It also ensures that key with the same value but from different mappers end up into the same reducer. The partitioned data gets written on the local file system from each map task. It waits there so that reducer can pull it. - -b. Reduce Task -The various phases in reduce task are as follows: - -i. Shuffle and Sort -The reducer starts with shuffle and sort step. This step downloads the data written by partitioner to the machine where reducer is running. This step sorts the individual data pieces into a large data list. The purpose of this sort is to collect the equivalent keys together. The framework does this so that we could iterate over it easily in the reduce task. This phase is not customizable. The framework handles everything automatically. However, the developer has control over how the keys get sorted and grouped through a comparator object. - -ii. Reduce -The reducer performs the reduce function once per key grouping. The framework passes the function key and an iterator object containing all the values pertaining to the key. - -We can write reducer to filter, aggregate and combine data in a number of different ways. Once the reduce function gets finished it gives zero or more key-value pairs to the outputformat. Like map function, reduce function changes from job to job. As it is the core logic of the solution. - -iii. OutputFormat -This is the final step. It takes the key-value pair from the reducer and writes it to the file by recordwriter. By default, it separates the key and value by a tab and each record by a newline character. We can customize it to provide richer output format. But none the less final data gets written to HDFS. - -Hadoop MapReduce Architecture Diagram - -3. YARN -YARN or Yet Another Resource Negotiator is the resource management layer of Hadoop. The basic principle behind YARN is to separate resource management and job scheduling/monitoring function into separate daemons. In YARN there is one global ResourceManager and per-application ApplicationMaster. An Application can be a single job or a DAG of jobs. - -Inside the YARN framework, we have two daemons ResourceManager and NodeManager. 
 - -3. YARN -YARN, or Yet Another Resource Negotiator, is the resource management layer of Hadoop. The basic principle behind YARN is to separate the resource management and job scheduling/monitoring functions into separate daemons. In YARN there is one global ResourceManager and a per-application ApplicationMaster. An application can be a single job or a DAG of jobs. - -Inside the YARN framework, we have two daemons: the ResourceManager and the NodeManager. The ResourceManager arbitrates resources among all the competing applications in the system. The job of the NodeManager is to monitor the resource usage by the containers and report it to the ResourceManager. The resources include CPU, memory, disk, network and so on. - -The ApplicationMaster negotiates resources with the ResourceManager and works with the NodeManager to execute and monitor the job. - -Hadoop Architecture - -The ResourceManager has two important components – the Scheduler and the ApplicationManager. - -i. Scheduler -The Scheduler is responsible for allocating resources to various applications. It is a pure scheduler in that it does not track the status of applications, nor does it reschedule tasks which fail due to software or hardware errors. The Scheduler allocates resources based on the requirements of the applications. - -ii. Application Manager -Following are the functions of the ApplicationManager: - -Accepts job submissions. -Negotiates the first container for executing the ApplicationMaster. A container incorporates resources such as CPU, memory, disk, and network. -Restarts the ApplicationMaster container on failure. -Functions of the ApplicationMaster: - -Negotiates resource containers from the Scheduler. -Tracks the resource container status. -Monitors progress of the application. -We can scale YARN beyond a few thousand nodes through the YARN Federation feature. This feature enables us to tie multiple YARN clusters into a single massive cluster. This allows for using independent clusters, clubbed together for a very large job. - -iii. Features of YARN -YARN has the following features: - -a. Multi-tenancy - -YARN allows a variety of access engines (open-source or proprietary) on the same Hadoop data set. These access engines can perform batch processing, real-time processing, iterative processing and so on. - -b. Cluster Utilization - -With the dynamic allocation of resources, YARN allows for good use of the cluster, compared to the static MapReduce slots in previous versions of Hadoop, which gave lower utilization of the cluster. - -c. Scalability - -Data center processing power keeps expanding. YARN’s ResourceManager focuses on scheduling and copes with the ever-expanding cluster, processing petabytes of data. - -d. Compatibility - -MapReduce programs developed for Hadoop 1.x can still run on YARN, without any disruption to processes that already work.
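To make the compatibility point above concrete, the sketch below shows a complete MapReduce driver written against the standard org.apache.hadoop.mapreduce.Job API, reusing Hadoop's stock TokenCounterMapper and IntSumReducer classes. Whether it runs on classic MapReduce or on YARN is decided entirely by the cluster configuration; the driver itself contains no YARN-specific code. This is an editor-added illustration under that assumption, not code from this repository.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountDriver.class);

        job.setMapperClass(TokenCounterMapper.class); // emits (word, 1) for every token
        job.setCombinerClass(IntSumReducer.class);    // optional local aggregation, as described earlier
        job.setReducerClass(IntSumReducer.class);     // sums the counts per word

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // input directory on HDFS
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output directory on HDFS

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

This is essentially the same WordCount job that appears further down in the MapReduce tutorial text, only expressed with the newer org.apache.hadoop.mapreduce API rather than the older JobConf-based org.apache.hadoop.mapred API used there.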
 - -Best Practices For Hadoop Architecture Design -i. Embrace Redundancy, Use Commodity Hardware -Many companies venture into Hadoop driven by business users or the analytics group; the infrastructure folks pitch in later. These people often have no idea about Hadoop. The result is an over-sized cluster which increases the budget manyfold. Hadoop was mainly created for cheap storage and deep data analysis. To achieve this, use JBOD, i.e. Just a Bunch Of Disks. Also, use a single power supply. - -ii. Start Small and Keep Focus -Many projects fail because of their complexity and expense. To avoid this, start with a small cluster of nodes and add nodes as you go along. Start with a small project so that the infrastructure and development people can understand the internal working of Hadoop. - -iii. Create Procedure For Data Integration -One of the features of Hadoop is that it allows dumping the data first; we can define the data structure later. We can get data in easily with tools such as Flume and Sqoop. But it is essential to create a data integration process. This includes various layers such as staging, naming standards, location etc. Make proper documentation of data sources and where they live in the cluster. - -iv. Use Compression Technique -Enterprises have a love-hate relationship with compression. There is a trade-off between performance and storage. Although compression decreases the storage used, it can decrease performance too. But Hadoop thrives on compression; it can reduce storage requirements by as much as 80%. - -v. Create Multiple Environments -It is a best practice to build multiple environments for development, testing, and production. As Apache Hadoop has a wide ecosystem, different projects in it have different requirements. Hence there is a need for a non-production environment for testing upgrades and new functionalities. - -Summary -Hence, in this Hadoop application architecture, we saw that the design of the Hadoop architecture is such that it recovers itself whenever needed. Its redundant storage structure makes it fault-tolerant and robust. We are able to scale the system linearly. The MapReduce part of the design works on the principle of data locality: the MapReduce framework moves the computation close to the data, thereby decreasing network traffic which would otherwise have consumed major bandwidth for moving large datasets. Thus the overall architecture of Hadoop makes it an economical, scalable and efficient big data technology. - -Hadoop Architecture is a very important topic for your Hadoop interview. We recommend you also check the most asked Hadoop interview questions; you will get many questions from Hadoop Architecture. \ No newline at end of file diff --git "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt.xml.xls" "b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt.xml.xls" deleted file mode 100644 index 7c6456bafba575d9246b014e810f8c2d5bd50418..0000000000000000000000000000000000000000 Binary files "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt.xml.xls" and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data-relation.txt deleted file mode 100644 index 323f777eb57a0e0f4b63e76fcbe42eb58c4cd3fc..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data-relation.txt +++ /dev/null @@ -1,16 +0,0 @@ -highest unit&work&AGGREGATION -MapReduce programming paradigm&Map Stage&依赖 -MapReduce programming paradigm&two-step data analysis process&依赖 -map stage&set&依赖 -set&datum&AGGREGATION -map stage&datum&依赖 -output&map function&AGGREGATION -Reduce job&output&依赖 -Reduce job&map function&依赖 -smaller set&tuple&AGGREGATION -reduce job&sequence&依赖 -sequence&name MapReduce&AGGREGATION -job&several mappers and reducer&依赖 -portion&task&依赖 -portion&job&AGGREGATION -slice&datum&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt deleted file mode 100644 index 
de93cfefd6ca4e0ff0946228aeae39ee290e4d71..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt +++ /dev/null @@ -1,21 +0,0 @@ -MapReduce -Hadoop MapReduce- Java-based Processing Framework for Big Data -MapReduce rules the roost for massive scale big data processing on Hadoop. The highest unit of work in Hadoop MapReduce is a Job. MapReduce programming paradigm uses a two-step data analysis process- Map Stage and Reduce Stage (reduce phase is optional). The map stage takes a set of data and converts it into another set where data elements are broken down into key-value pairs or tuples. Reduce job takes the output of the map function and combines them into smaller set of tuples or key-value pairs. The reduce job is always performed when the map job is completed - hence the sequence of the name MapReduce. - -MapReduce Overview -MapReduce Terminologies -MapReduce Life Cycle -MapReduce Advantages -MapReduce Blogs -MapReduce Tutorials -MapReduce Interview Questions -MapReduce Slides -MapReduce Videos -MapReduce Questions & Answers -MapReduce Assignments - -MapReduce Terminologies -Job - It is the complete process to execute including the mappers, the input, the reducers and the output across a particular dataset. -Task - Every job is divided into several mappers and reducers. A portion of the job executed on a slice of data can be referred to as a task. -JobTracker - It is the master node for managing all the jobs and resources in a hadoop cluster. -TaskTracker - These are the agents deployed to each machine in the hadoop cluster to run Map and Reduce tasks and then report the status to the JobTracker after execution. \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt.xml.xls deleted file mode 100644 index cfdb78508ac5c294f33e735f8cb2a6f3616d7424..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce.xls new file mode 100644 index 0000000000000000000000000000000000000000..b28fcdd7b1710bf6526eeb5ca134cd8680467ac4 Binary files /dev/null and b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Hadoop MapReduce.xls differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture1-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture1-relation.txt deleted file mode 100644 index 57cac498c18d1631d83e4d12d33a1177b7ebbb26..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture1-relation.txt +++ /dev/null @@ -1,107 +0,0 @@ -Introduction&large set&依赖 -Introduction&datum&依赖 -large set&datum&AGGREGATION -Google&that&依赖 -MapReduce&picture&依赖 -huge chunk&datum&AGGREGATION -huge amount&datum&AGGREGATION -parallel processing&huge amount&AGGREGATION -Map Reduce program&programmer&依赖 -they&MapReduce&依赖 -development&applications and deployment&AGGREGATION -development&programmer&依赖 -flow pattern&MapReduce&AGGREGATION -they&flow pattern&依赖 -Explanation&Python and 
C++&依赖 -Explanation&MapReduce Architecture Hadoop&AGGREGATION -Explanation&programming language&依赖 -application&software processing huge amount&AGGREGATION -software processing huge amount&datum&AGGREGATION -framework&task&依赖 -chunk&datum&AGGREGATION -framework&chunk&依赖 -a file-system store&work and input&依赖 -work and input&job&AGGREGATION -a file-system store&job&依赖 -Re-execution&framework&依赖 -Re-execution&framework&依赖 -task&framework&AGGREGATION -Re-execution&failed task&AGGREGATION -architecture&two main processing stage&依赖 -architecture&MapReduce&AGGREGATION -MapReduce&Job tracker&依赖 -Intermediate process&place&依赖 -local file system store&intermediate datum&依赖 -take&other datum&依赖 -certain number&output&AGGREGATION -take&it&依赖 -take&set&依赖 -breakdown&individual element&AGGREGATION -take&it&依赖 -take&set&依赖 -take&other datum&依赖 -set&other datum&AGGREGATION -Mappers output&reduction&依赖 -single mapper&reduced function&依赖 -new output value&hdf&依赖 -MapReduce architecture&architecture&GENERALIZATION -MapReduce Architecture Components Below&component&依赖 -component&MapReduce architecture&AGGREGATION -MapReduce Architecture Components Below&MapReduce architecture&依赖 -explanation&component&AGGREGATION -Map Phase Map phase&two part&依赖 -Map Phase Map phase&input datum&依赖 -Value&processing stage&依赖 -Let ’s&input datum&依赖 -Key-value pair conversion&record reader&依赖 -Key-value pair conversion&input datum&依赖 -piece&data format and code&AGGREGATION -reducer code place input&reducer code place input&依赖 -reducer code place input&combiner&依赖 -reducer code place input&reducer code place input&依赖 -reducer code place input&combiner&依赖 -partition module&key role&依赖 -partition module&Hadoop&依赖 -map input&sort and shuffle phase&依赖 -intermediate datum&local file system&依赖 -Hadoop node&replication&依赖 -Reducer Phase&data input&依赖 -Reducer Phase&data input&依赖 -reducer&searching&依赖 -number&reducer&AGGREGATION -speculative execution&job processing&依赖 -speculative execution&prominent role&依赖 -task&run&依赖 -more than one mapper&similar datum&依赖 -task&next mapper&依赖 -task&fast program&依赖 -| Verifiable Certificate&Access&AGGREGATION -job&two component&依赖 -job&split&依赖 -job&Map task&依赖 -complete execution&given job&AGGREGATION -Conclusion&document&依赖 -lot&document&AGGREGATION -Conclusion&lot&依赖 -you&number&依赖 -number&word&AGGREGATION -occurrence&word&AGGREGATION -number&occurrence&AGGREGATION -you&word&依赖 -you&lot&依赖 -lot&web page&AGGREGATION -you&web page&依赖 -them&search query&依赖 -I&arbitrary task&依赖 -reducer&datum&依赖 -reducer&aggregation&依赖 -it&key&依赖 -aggregation&datum&AGGREGATION -Recommended Articles This&MapReduce Architecture&依赖 -component&architecture&AGGREGATION -we&explanation&依赖 -we&MapReduce Architecture&依赖 -we&component&依赖 -we&introduction&依赖 -You&more –&依赖 -our&articles& diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt deleted file mode 100644 index c08e74f85bc6402cf53cb874c6a6c2c095431aeb..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt +++ /dev/null @@ -1,48 +0,0 @@ -Introduction to MapReduce Architecture -Hadoop cluster stores a large set of data which is parallelly processed mainly by MapReduce. Firstly, it was just a thesis that Google designed. That provides parallelism, fault-tolerance, and data distribution. For processing huge chunks of data, MapReduce comes into the picture. 
Map Reduce provides API with features such as parallel processing of huge amounts of data, batch processing, and high availability. Map Reduce programs are written by programmers when there is a need for an application for business scenarios. The development of applications and deployment across Hadoop clusters is done by the programmers when they understand the flow pattern of MapReduce. - -Explanation of MapReduce Architecture -Hadoop can be developed in programming languages like Python and C++. MapReduce Hadoop is a software framework for ease in writing applications of software processing huge amounts of data. MapReduce is a framework which splits the chunk of data, sorts the map outputs and input to reduce tasks. A File-system stores the work and input of jobs. Re-execution of failed tasks, scheduling them, and monitoring them is the task of the framework. - -The architecture of MapReduce basically has two main processing stages, and those are Map and Reduce. The MapReduce happens in Job tracker. Intermediate processes will take place in between the Map and Reduce phases. Sort and shuffle are the tasks taken up by Map and Reduce, which are done intermediate. The local file system stores the intermediate data. - -Map() Function: Create and process the import data. Takes in data, converts it into a set of other data where the breakdown of individual elements into tuples is done—no API contract requiring a certain number of outputs. -Reduce() Function: Mappers output is passed into the reduction. Processes the data into something usable. Every single mapper is passed into the reduced function. The new output values are saved into HDFS. -MapReduce Architecture Components -Below is the explanation of components of MapReduce architecture: - -MapReduce1 -1. Map Phase -Map phase splits the input data into two parts. They are Keys and Values. Writable and comparable is the key in the processing stage where only in the processing stage, Value is writable. Let’s say a client gives input data to a Hadoop system; task tracker is assigned tasks by job tracker. Splitting of input is done into several inputs. Key-value pair conversion is done with the input data by the record reader. This is the actual data input for Map as in mapped information for further processing. The format type varies, so the coder has to look into each piece of data format and code accordingly. - -Mini reducer which is commonly called a combiner, the reducer code places input as the combiner. Network bandwidth is high when a huge amount of data is required. Hash is the default partition used. The partition module plays a key role in Hadoop. More performance is given by reducing the pressure by petitioner on the reducer. - -2. Processing in Intermediate -In the intermediate phase, the map input gets into the sort and shuffle phase. Hadoop nodes do not have replications where all the intermediate data is stored in a local file system. Round – robin data is used by Hadoop to write to local disk, the intermediate data. There are other shuffles and sort factors to be considered to reach the condition of writing the data to local disks. - -3. Reducer Phase -The reducer takes in the data input that is sorted and shuffled. All the input data will be combined, and similar key-value pairs are to be written to the hdfs system. For searching and mapping purposes, a reducer is not always necessary. Setting some properties for enabling to develop of the number of reducers for each task. 
During job processing, speculative execution plays a prominent role. The performance is FIFO that is first in first out, and if more than one mapper is working on similar data, and if one is running slow, then the tasks are assigned to the next mapper for a fast program run by the job tracker. - - Popular Course in this category -MapReduce Training (2 Courses, 4+ Projects) -2 Online Courses | 4 Hands-on Projects | 19+ Hours | Verifiable Certificate of Completion | Lifetime Access -4.5 (6,176 ratings)Course Price -$79 $399 -View Course -Related Courses -Data Scientist Training (76 Courses, 60+ Projects)Machine Learning Training (17 Courses, 27+ Projects)Hadoop Training Program (20 Courses, 14+ Projects, 4 Quizzes) -MapReduce2 -This is how MapReduce organizers work. - -The job is divided into two components: Map tasks (Splits and mapping) and Reduce tasks (Reducing and shuffling). -The above picture says that Job tracker is associated with complete execution of a given job, by behaving like a master. Whereas, the Multiple task trackers act like slaves by performing the job each. -Conclusion -Imagine you have lots of documents, which is huge data. And you need to count the number of occurrences of each word throughout the documents. I might seem like an arbitrary task, but the basic idea is that let’s say you have a lot of web pages and you want to make them available for search queries. The reducer does aggregation of data, and it consists of all the keys and combines them all for similar key-value pairs which is basically the Hadoop shuffling process. - -Recommended Articles -This is a guide to MapReduce Architecture. Here we discuss an introduction to MapReduce Architecture, explanation of components of the architecture in detail. You can also go through our other related articles to learn more – - -Mapreduce Combiner -How MapReduce Works -What is MapReduce in Hadoop? 
-MapReduce Algorithms diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt.xml.xls deleted file mode 100644 index baa4ea40dea9c37485d5dd346501ceb9d73e882c..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture2-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture2-relation.txt deleted file mode 100644 index 4afd378239a129f057a052ebbc7c28e787d16417..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture2-relation.txt +++ /dev/null @@ -1,117 +0,0 @@ -a programming model and expectation&Hadoop&依赖 -underlying system&care&依赖 -Input&multiple chunks/blocks&依赖 -chunk/block&different node&依赖 -chunk/block&datum&AGGREGATION -MapReduce architecture&phase&依赖 -input files inputformat inputsplit recordreader mapper combiner partitioner shuffling&Reducer RecordWriter OutputFormat Input Files&依赖 -input datum&input file&依赖 -input file&hdf ( hadoop&依赖 -input file&file system )&依赖 -format&file&AGGREGATION -InputFormat - InputFormat&Map-Reduce job&依赖 -InputFormat - InputFormat&input-specification&依赖 -InputFormat&file or other object&依赖 -InputFormat&InputSplit&依赖 -InputFormat&selected input file&依赖 -InputFormat&byte&依赖 -InputFormat&input&依赖 -InputFormat&input file&依赖 -InputFormat&input&依赖 -byte&input file&AGGREGATION -InputSplit - InputSplit&InputFormat&依赖 -InputSplit&datum&依赖 -number&InputSplits&依赖 -number&map task&AGGREGATION -number&InputSplits&AGGREGATION -number&number&依赖 -number&InputSplits&依赖 -number&number&依赖 -record&specific mapper&依赖 -InputSplit&input record&依赖 -InputSplit&input&依赖 -InputSplit&byte-oriented view&依赖 -RecordReader - RecordReader&InputSplit&依赖 -RecordReader - RecordReader&Hadoop MapReduce&依赖 -RecordReader&InputSplit&依赖 -RecordReader&< key , value > pair&依赖 -RecordReader&InputSplit&依赖 -byte-oriented view&input&AGGREGATION -RecordReader&byte-oriented view&依赖 -RecordReader&input&依赖 -record-oriented view&input datum&AGGREGATION -RecordReader&datum&依赖 -RecordReader&key-value pair&依赖 -RecordReader&InputSplit&依赖 -key-value pair&further processing&依赖 -key-value pair&mapper&依赖 -Mapper - Mapper&input record&依赖 -input record&record&GENERALIZATION -mapper output&as intermediate output&依赖 -mapper output&local disk&依赖 -mapper output&output&GENERALIZATION -it&unnecessary copy&依赖 -Mappers output&combiner&依赖 -Mappers output&further process&依赖 -Mappers output&output&GENERALIZATION -Map&set&依赖 -set&datum&AGGREGATION -Map&datum&依赖 -Mapper&datum&依赖 -form&key/value pair&AGGREGATION -Mapper&key/value pair&依赖 -Mapper&form&依赖 -Combiner&map task&依赖 -output&map task&AGGREGATION -Combiner&output&依赖 -combiner&local reducer&依赖 -Hadoop&combiner function&依赖 -Hadoop&map output&依赖 -Hadoop&one or many time&依赖 -map output&output&GENERALIZATION -how output&reducer&依赖 -how output&reducer&依赖 -Partitioner&keys partition&依赖 -Partitioner&intermediate map-output&依赖 -keys partition&intermediate map-output&AGGREGATION -key&key&AGGREGATION -number&job&依赖 -number&reduce task&依赖 -number&of&AGGREGATION -total number&partition&AGGREGATION -its&execution& -Partitioner&same machine&依赖 -mapper&execution&依赖 -partitioner form number&partitioner form number&依赖 -partitioner form number&reduce task group&依赖 -partitioner form 
number&reduce task group&AGGREGATION -Hadoop framework&hash base partitioner&依赖 -Hadoop framework&default&依赖 -Hadoop framework&default&依赖 -hash partitioner partition&key space&依赖 -hash partitioner partition&key space&依赖 -output&partitioner&AGGREGATION -physical movement&datum&AGGREGATION -shuffling&network&依赖 -shuffling&datum&依赖 -mapper&process&依赖 -output produce&reducer node&依赖 -their&process& -intermediate value&list&依赖 -reducer task&mapper&依赖 -reducer task&output&依赖 -reducer task&input&依赖 -smaller set&tuple&AGGREGATION -their&lists& -intermediate key&reducer&依赖 -intermediate key&sorted key order&依赖 -reducer&zero or more final key/value pair&依赖 -RecordWriter & OutputFormat&Reducer phase&依赖 -RecordWriter & OutputFormat&output file&依赖 -RecordWriter & OutputFormat&output key-value pair&依赖 -output key-value pair&key-value pair&GENERALIZATION -Reducer phase&phase&GENERALIZATION -way&OutputFormat&依赖 -final output&OutputFormat instance&依赖 -final output&hdf&依赖 -final output&reducer&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt deleted file mode 100644 index c29fcc12b99fe2dbf843cee6167124c5029762d0..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt +++ /dev/null @@ -1,72 +0,0 @@ -MapReduce is a programming model and expectation is parallel processing in Hadoop. MapReduce makes easy to distribute tasks across nodes and performs Sort or Merge based on distributed computing. - -The underlying system takes care of partitioning input data, scheduling the programs execution across several machines, handling machine failures and managing inter-machine communication. - -Input will be divided into multiple chunks/blocks. Each and every chunk/block of data will be processed in different nodes. MapReduce architecture contains the below phases - - -Input Files -InputFormat -InputSplit -RecordReader -Mapper -Combiner -Partitioner -Shuffling and Sorting -Reducer -RecordWriter -OutputFormat -Input Files - -In general, the input data to process using MapReduce task is stored in input files. These input files typically reside in HDFS (Hadoop Distributed File System). The format of these files is random where other formats like binary or log files can also be used. - -InputFormat - -InputFormat describes the input-specification for a Map-Reduce job. InputFormat defines how the input files are to split and read. InputFormat selects the files or other objects used for input. - -InputFormat creates InputSplit from the selected input files. InputFormat split the input into logical InputSplits based on the total size, in bytes of the input files. - -InputSplit - -InputSplit is created by InputFormat. InputSplit logically represents the data to be processed by an individual Mapper. One map task is created to process one InputSplit. - -The number of map tasks normally equals to the number of InputSplits. The InputSplit is divided into input records and each record is processed by the specific mapper assigned to process the InputSplit. InputSplit presents a byte-oriented view on the input. - -RecordReader - -RecordReader communicates with the InputSplit in Hadoop MapReduce. RecordReader reads pairs from an InputSplit. RecordReader converts the byte-oriented view of the input from the InputSplit. - -RecordReader provides a record-oriented view of the input data for mapper and reducer tasks processing. 
RecordReader converts the data into key-value pairs suitable for reading by the mapper. - -RecordReader communicates with the InputSplit until the file reading is not completed. Once the file reading completed, these key-value pairs are sent to the mapper for further processing. - -Mapper - -Mapper processes each input record and generates new key-value pair. Mapper generated key-value pair is completely different from the input key-value pair. The mapper output is called as intermediate output. - -The mapper output is not written to local disk because of it creates unnecessary copies. Mappers output is passed to the combiner for further process. - -Map takes a set of data and converts it into another set of data, where individual elements are broken down into key pairs. The Mapper reads the data in the form of key/value pairs and outputs zero or more key/value pairs. - -Combiner - -Combiner acts as a mini reducer in MapReduce framework. This is an optional class provided in MapReduce driver class. Combiner process the output of map tasks and sends it to the Reducer. - -For every mapper, there will be one Combiner. Combiners are treated as local reducers. Hadoop does not provide any guarantee on combiner’s execution. - -Hadoop may not call combiner function if it is not required. Hadoop may call one or many times for a map output based on the requirement. - -Partitioner - -Partitioner allows distributing how outputs from the map stage are send to the reducers. Partitioner controls the keys partition of the intermediate map-outputs. The key or a subset of the key is used to derive the partition by a hash function. - -The total number of partitions is almost same as the number of reduce tasks for the job. Partitioner runs on the same machine where the mapper had completed its execution by consuming the mapper output. Entire mapper output sent to partitioner. - -Partitioner forms number of reduce task groups from the mapper output. By default, Hadoop framework is hash based partitioner. The Hash partitioner partitions the key space by using the hash code. - -Shuffling and Sorting - -The output of the partitioner is Shuffled to the reduce node. The shuffling is the physical movement of the data over the network. Once the mappers finished their process, the output produced are shuffled on reducer nodes. - -The mapper output is called as intermediate output and it is merged and then sorted. The sorted output is provided as a input to the reducer phase. - -Reducer - -After the map phase is over, all the intermediate values for the intermediate keys are combined into a list. Reducer task, which takes the output from a mapper as an input and combines those data tuples into a smaller set of tuples. There may be single reducer, multiple reducers. - -All the values associated with an intermediate key are guaranteed to go to the same reducer. The intermediate key and their value lists are passed to the reducer in sorted key order. The reducer outputs zero or more final key/value pairs and these are written to HDFS. - -RecordWriter & OutputFormat - -RecordWriter writes these output key-value pair from the Reducer phase to the output files. The way of writing the output key-value pairs to output files by RecordWriter is determined by the OutputFormat. - -OutputFormat instances provided by the Hadoop are used to write files in HDFS or on the local disk. The final output of reducer is written on HDFS by OutputFormat instances. 
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt.xml.xls deleted file mode 100644 index ac66005418b2b4a5b401b9a494edc4774ff401ce..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture3-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture3-relation.txt deleted file mode 100644 index ec67674ebb9e0c48052a55f9d5f847dec7fddfbc..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture3-relation.txt +++ /dev/null @@ -1,71 +0,0 @@ -two major component&Hadoop&AGGREGATION -2020 MapReduce and HDFS&Hadoop&依赖 -library&various different-different optimization&依赖 -library&many programming language&依赖 -it&equivalent task&依赖 -purpose&MapReduce&AGGREGATION -it&it&依赖 -MapReduce task&two phase map phase&依赖 -who&Job&依赖 -Components&MapReduce Architecture&AGGREGATION -who&MapReduce&依赖 -who&processing&依赖 -multiple client&job&依赖 -multiple client&job for ###&依赖 -multiple client&job&依赖 -multiple client&job for ###&依赖 -client&that&依赖 -Hadoop MapReduce Master&particular job&依赖 -Hadoop MapReduce Master&subsequent job-part&依赖 -result&final output&依赖 -result&job-part&AGGREGATION -final result&processing&依赖 -we&MapReduce&依赖 -we&client&依赖 -client&Hadoop MapReduce Master&依赖 -client&job&依赖 -job&particular size&AGGREGATION -client&particular size&依赖 -MapReduce master&job&依赖 -MapReduce master&further equivalent job-part&依赖 -Map and Reduce task&use-case&依赖 -requirement&use-case&AGGREGATION -Map and Reduce task&requirement&依赖 -developer&their logic&依赖 -developer&requirement&依赖 -industry&that&依赖 -their&logic& -input datum&Map Task&依赖 -its&output& -Map&intermediate key-value pair&依赖 -Map&output&依赖 -we&which&依赖 -key-value pair&Reducer&依赖 -final output&hdf&依赖 -output&map i.e.&AGGREGATION -n number&Map and Reduce task&AGGREGATION -algorithm&minimum&依赖 -algorithm&optimized way&依赖 -’s&MapReduce phase&依赖 -’s&architecture&依赖 -its&architecture& -’s&better understanding&依赖 -better understanding&architecture&AGGREGATION -MapReduce task&2 phase i.e. map phase&依赖 -its&use& -key&kind&依赖 -id&kind&AGGREGATION -kind&address and value&AGGREGATION -key&address and value&依赖 -its&repository& -Map ( ) function&memory repository&依赖 -its&pair& -work&Job tracker&AGGREGATION -hundred&data node&AGGREGATION -work&resource&依赖 -Task Tracker&actual slave&依赖 -cluster&Map and Reduce task&依赖 -also one important component&MapReduce Architecture&AGGREGATION -daemon process&task or application&依赖 -daemon process&historical information&依赖 -log&Job History Server&依赖 diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt deleted file mode 100644 index 794ecd1aeec1f713493b7a669dd4e73d917e708e..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt +++ /dev/null @@ -1,32 +0,0 @@ -MapReduce Architecture -Last Updated : 10 Sep, 2020 -MapReduce and HDFS are the two major components of Hadoop which makes it so powerful and efficient to use. MapReduce is a programming model used for efficient processing in parallel over large data-sets in a distributed manner. 
The data is first split and then combined to produce the final result. The libraries for MapReduce is written in so many programming languages with various different-different optimizations. The purpose of MapReduce in Hadoop is to Map each of the jobs and then it will reduce it to equivalent tasks for providing less overhead over the cluster network and to reduce the processing power. The MapReduce task is mainly divided into two phases Map Phase and Reduce Phase. - -MapReduce Architecture: - -MapReduce-Architecture - -Components of MapReduce Architecture: -Client: The MapReduce client is the one who brings the Job to the MapReduce for processing. There can be multiple clients available that continuously send jobs for processing to the Hadoop MapReduce Manager. -Job: The MapReduce Job is the actual work that the client wanted to do which is comprised of so many smaller tasks that the client wants to process or execute. -Hadoop MapReduce Master: It divides the particular job into subsequent job-parts. -Job-Parts: The task or sub-jobs that are obtained after dividing the main job. The result of all the job-parts combined to produce the final output. -Input Data: The data set that is fed to the MapReduce for processing. -Output Data: The final result is obtained after the processing. -In MapReduce, we have a client. The client will submit the job of a particular size to the Hadoop MapReduce Master. Now, the MapReduce master will divide this job into further equivalent job-parts. These job-parts are then made available for the Map and Reduce Task. This Map and Reduce task will contain the program as per the requirement of the use-case that the particular company is solving. The developer writes their logic to fulfill the requirement that the industry requires. The input data which we are using is then fed to the Map Task and the Map will generate intermediate key-value pair as its output. The output of Map i.e. these key-value pairs are then fed to the Reducer and the final output is stored on the HDFS. There can be n number of Map and Reduce tasks made available for processing the data as per the requirement. The algorithm for Map and Reduce is made with a very optimized way such that the time complexity or space complexity is minimum. - -Let’s discuss the MapReduce phases to get a better understanding of its architecture: - - - -The MapReduce task is mainly divided into 2 phases i.e. Map phase and Reduce phase. - -Map: As the name suggests its main use is to map the input data in key-value pairs. The input to the map may be a key-value pair where the key can be the id of some kind of address and value is the actual value that it keeps. The Map() function will be executed in its memory repository on each of these input key-value pairs and generates the intermediate key-value pair which works as input for the Reducer or Reduce() function. - -Reduce: The intermediate key-value pairs that work as input for Reducer are shuffled and sort and send to the Reduce() function. Reducer aggregate or group the data based on its key-value pair as per the reducer algorithm written by the developer. -How Job tracker and the task tracker deal with MapReduce: - -Job Tracker: The work of Job tracker is to manage all the resources and all the jobs across the cluster and also to schedule each map on the Task Tracker running on the same data node since there can be hundreds of data nodes available in the cluster. 
- -Task Tracker: The Task Tracker can be considered as the actual slaves that are working on the instruction given by the Job Tracker. This Task Tracker is deployed on each of the nodes available in the cluster that executes the Map and Reduce task as instructed by Job Tracker. -There is also one important component of MapReduce Architecture known as Job History Server. The Job History Server is a daemon process that saves and stores historical information about the task or application, like the logs which are generated during or after the job execution are stored on Job History Server. \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt.xml.xls deleted file mode 100644 index c3dc202614b46ce1876f001c18840b28abe7485d..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Tutorial-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Tutorial-relation.txt deleted file mode 100644 index f69f018e1c74d0864b47d113a689c57ed421f90a..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Tutorial-relation.txt +++ /dev/null @@ -1,1118 +0,0 @@ -MapReduce framework&framework&GENERALIZATION -Purpose This document&user-facing facet&依赖 -Hadoop MapReduce framework&MapReduce framework&GENERALIZATION -Purpose This document&Hadoop MapReduce framework&依赖 -user-facing facet&Hadoop MapReduce framework&AGGREGATION -single node setup&first-time user&依赖 -single node setup&first-time user&依赖 -application&large cluster&依赖 -vast amount and multi-terabyte data-set&datum&AGGREGATION -application&commodity hardware&依赖 -application&vast amount and multi-terabyte data-set&依赖 -large cluster&commodity hardware&AGGREGATION -thousand&node&AGGREGATION -MapReduce job&input data-set&依赖 -MapReduce job&job&GENERALIZATION -MapReduce job&independent chunk&依赖 -output&map&AGGREGATION -framework sort&output&依赖 -framework sort&map&依赖 -input&file-system&依赖 -output&job&AGGREGATION -care&scheduling task&AGGREGATION -framework&care&依赖 -framework&scheduling task&依赖 -file system (&same set&依赖 -file system (&same set&依赖 -same set&node&AGGREGATION -file system (&node&依赖 -file system (&node&依赖 -configuration&resulting&依赖 -configuration&framework&依赖 -configuration&task&依赖 -MapReduce framework&single master JobTracker&依赖 -jobs&tasks& -slave&task&依赖 -implementation&appropriate interfaces and/or abstract-class&AGGREGATION -application&input/output locations and supply map&依赖 -job configuration&configuration&GENERALIZATION -Hadoop job client&JobTracker&依赖 -JobTracker&responsibility&依赖 -Hadoop job client&job ( jar/executable etc. 
) and configuration&依赖 -JobTracker&job-client&依赖 -JobTracker&software/configuration&依赖 -Hadoop framework&JavaTM&实现 -MapReduce application&Java&依赖 -Hadoop framework&framework&GENERALIZATION -mapper&reducer&依赖 -Hadoop Pipes&compatible C++ API&依赖 -set&< key , value > pair&AGGREGATION -job&different type&AGGREGATION -Inputs and output&input&依赖 -Inputs and output&input&依赖 -key class&WritableComparable interface&实现 -input and output type&MapReduce job&AGGREGATION -we&detail&依赖 -MapReduce application&application&GENERALIZATION -simple application&number&依赖 -simple application&word&依赖 -number&word&AGGREGATION -number&occurence&AGGREGATION -occurence&word&AGGREGATION -simple application&number&依赖 -simple application&word&依赖 -source code wordcount.java 1&source code wordcount.java 1&依赖 -public class wordcount { 13&public class wordcount { 13&依赖 -public void map&IOException { 19&依赖 -public void map&IOException { 19&依赖 -public void map&IOException { 19&依赖 -public void map&IOException { 19&依赖 -while ( tokenizer.hasmoretokens&) ) { 22&依赖 -public void reduce&IOException { 30&依赖 -public void reduce&IOException { 30&依赖 -public void reduce&IOException { 30&依赖 -public void reduce&IOException { 30&依赖 -while ( values.hasnext ( ) ) { 32&while ( values.hasnext ( ) ) { 32&依赖 -public static void main ( string [ ] arg&Exception { 39&依赖 -root&installation and hadoop_version be&AGGREGATION -/&input&GENERALIZATION -/&output&GENERALIZATION -input&/&GENERALIZATION -cat / usr/joe/wordcount / output/part -00000 Bye 1 Goodbye 1 Hadoop 2 Hello 2 World 2 application&comma separated list&依赖 -cat / usr/joe/wordcount / output/part -00000 Bye 1 Goodbye 1 Hadoop 2 Hello 2 World 2 application&path&依赖 -comma separated list&path&AGGREGATION -current working directory&task&AGGREGATION -classpath&map&AGGREGATION -- libjars option&application&依赖 -- libjars option&jar&依赖 -archive&argument&依赖 -archive&comma separated list&依赖 -comma separated list&archive&AGGREGATION -archive&them&依赖 -link&task&依赖 -link¤t working directory&依赖 -name&archive&AGGREGATION -More detail&Commands Guide&依赖 -More detail&Commands Guide&依赖 -myarchive.zip&directory&依赖 -myarchive.zip&name " myarchive.zip "&依赖 -user&##&依赖 -user&different symbolic name&依赖 -txt&task&依赖 -txt&example&依赖 -txt&symbolic name dict1 and dict2&依赖 -archive mytar.tgz&directory&依赖 -archive mytar.tgz&name " tgzdir "&依赖 -mapper implementation ( line&time&依赖 -mapper implementation ( line&one line&依赖 -mapper implementation ( line&map method ( line&依赖 -It&line&依赖 -key-value pair&< , 1 >&AGGREGATION -It&token&依赖 -We&more&依赖 -We&map&依赖 -number&map&AGGREGATION -We&number&依赖 -WordCount&a combiner ( line&依赖 -output&local combiner (&依赖 -output&same&依赖 -output&local aggregation&依赖 -2 > The output&second map&AGGREGATION -output&value&依赖 -output&first map&AGGREGATION -< Bye&JobConf&依赖 -run method&various facet&依赖 -run method&job&依赖 -< Bye&JobConf&依赖 -< Bye&input/output format etc.&依赖 -< Bye&JobConf&依赖 -< Bye&JobConf&依赖 -< Bye&JobConf&依赖 -< Bye&input/output format etc.&依赖 -< Bye&input/output format etc.&依赖 -< Bye&input/output format etc.&依赖 -< Bye&input/output format etc.&依赖 -< Bye&JobConf&依赖 -< Bye&JobConf&依赖 -various facet&job&AGGREGATION -< Bye&input/output format etc.&依赖 -< Bye&input/output format etc.&依赖 -< Bye&JobConf&依赖 -< Bye&JobConf&依赖 -command line&line&GENERALIZATION -< Bye&input/output format etc.&依赖 -< Bye&input/output format etc.&依赖 -< Bye&JobConf&依赖 -< Bye&input/output format etc.&依赖 -< Bye&input/output format etc.&依赖 -run method&input/output path&依赖 -< Bye&input/output format etc.&依赖 -< 
Bye&JobConf&依赖 -< Bye&JobConf&依赖 -< Bye&input/output format etc.&依赖 -< Bye&input/output format etc.&依赖 -< Bye&JobConf&依赖 -< Bye&JobConf&依赖 -It&and&依赖 -It&jobclient.runjob ( line&依赖 -its&progress& -We&JobConf&依赖 -We&more&依赖 -We&JobConf&依赖 -reasonable amount&detail&AGGREGATION -user-facing aspect&MapReduce framework&AGGREGATION -their&jobs& -us&Mapper and Reducer interface&依赖 -application&them&实现 -We&JobConf , JobClient , Partitioner , OutputCollector , Reporter , InputFormat , OutputFormat , OutputCommitter and other&依赖 -We&other core interface&依赖 -useful feature&framework&AGGREGATION -payload application&Mapper and Reducer interface&实现 -core&job&AGGREGATION -Mapper Mapper&set&依赖 -set&intermediate key/value pair&AGGREGATION -Mapper Mapper&input key/value pair&依赖 -Mapper Mapper&intermediate key/value pair&依赖 -individual task&input record&依赖 -individual task&intermediate record&依赖 -given input pair&zero or many output pair&依赖 -Hadoop MapReduce framework&one map task&依赖 -Hadoop MapReduce framework&InputSplit&依赖 -map task&task&GENERALIZATION -framework&task&依赖 -framework&map ( writablecomparable , writable , outputcollector , reporter )&依赖 -framework&key/value pair&依赖 -application&required cleanup&依赖 -application&Closeable.close ( ) method&依赖 -Output pair&call&依赖 -Output pair&outputcollector.collect ( writablecomparable , writable )&依赖 -application&Reporter&依赖 -intermediate value&framework&依赖 -user&grouping&依赖 -number&job&依赖 -number&reduce task&依赖 -number&of&AGGREGATION -total number&partition&AGGREGATION -intermediate output&datum&依赖 -amount&datum&AGGREGATION -user&combiner&依赖 -user&jobconf.setcombinerclass ( class )&依赖 -local aggregation&intermediate output&AGGREGATION -intermediate output&amount&依赖 -intermediate , sorted output&value ) format a simple ( key-len&依赖 -intermediate , sorted output&a simple ( key-len&依赖 -intermediate , sorted output&value ) format&实现 -intermediate , sorted output&a simple ( key-len&实现 -CompressionCodec&JobConf&依赖 -block&input file&AGGREGATION -total number&block&AGGREGATION -number&input&依赖 -number&total size&依赖 -total size&input&AGGREGATION -right level¶llelism&AGGREGATION -map&execute&依赖 -map&minute&依赖 -10TB&input datum&AGGREGATION -you&input datum&依赖 -blocksize&128MB&AGGREGATION -you&10TB&依赖 -Reducer Reducer&set&依赖 -smaller set&value&AGGREGATION -intermediate value&key&依赖 -Reducer Reducer&intermediate value&依赖 -intermediate value&value&依赖 -intermediate value&smaller set&依赖 -set&intermediate value&AGGREGATION -Reducer Reducer&Reducer&GENERALIZATION -number&reduce&AGGREGATION -number&user&依赖 -number&jobconf.setnumreducetasks ( int )&依赖 -( list&value&AGGREGATION -framework&( writablecomparable , iterator , outputcollector , reporter ) method&依赖 -Reducer&3 primary phase&依赖 -Reducer&shuffle&依赖 -Shuffle Input&mapper&依赖 -sorted output&mapper&AGGREGATION -Shuffle Input&mapper&依赖 -framework&HTTP&依赖 -framework&relevant partition&依赖 -framework&HTTP&依赖 -output&mapper&AGGREGATION -framework&output&依赖 -relevant partition&output&AGGREGATION -framework&relevant partition&依赖 -framework&output&依赖 -different mapper&same key )&依赖 -one&jobconf.setoutputvaluegroupingcomparator ( class )&依赖 -one&Comparator&依赖 -output&FileSystem&依赖 -output&outputcollector.collect ( writablecomparable , writable )&依赖 -output&reduce task&AGGREGATION -output&Reducer&AGGREGATION -right number&reduce&AGGREGATION -their&round& -faster node&reduce&依赖 -faster node&reduce&依赖 -faster node&first round&依赖 -better job&load balancing&AGGREGATION -faster node&first round&依赖 -first round&reduce&AGGREGATION 
-cost&failure&AGGREGATION -scaling factor&a few reduce slot&依赖 -scaling factor&speculative-task&依赖 -number&reduce-task&AGGREGATION -output&output path&依赖 -output path&path&GENERALIZATION -output&case&依赖 -output&set&依赖 -output&output path&依赖 -output&set&依赖 -output&case&依赖 -output&FileSystem&依赖 -output&FileSystem&依赖 -output&map-task&AGGREGATION -framework&map-output&依赖 -Partitioner Partitioner&Partitioner&GENERALIZATION -Partitioner Partitioner&key space&依赖 -Partitioner&key&依赖 -key&intermediate map-output&AGGREGATION -Partitioner&intermediate map-output&依赖 -partitioning&key&AGGREGATION -Partitioner&partitioning&依赖 -subset&key )&AGGREGATION -this control&task&依赖 -intermediate key (&for reduction&依赖 -Reporter Reporter&MapReduce application&依赖 -Reporter Reporter&Reporter&GENERALIZATION -Reporter Reporter&progress&依赖 -Mapper and Reducer implementation&Reporter&依赖 -application&time&依赖 -significant amount&time&AGGREGATION -application&significant amount&依赖 -framework&task&依赖 -way&configuration parameter mapred.task.timeout&依赖 -application&counter&依赖 -application&Reporter&依赖 -generalization&facility&AGGREGATION -OutputCollector OutputCollector&facility&依赖 -output&job )&AGGREGATION -library&useful mapper&AGGREGATION -MapReduce job configuration&job configuration&GENERALIZATION -Job Configuration JobConf&MapReduce job configuration&依赖 -JobConf&user&依赖 -framework&job&依赖 -other parameter&rest&依赖 -other parameter&framework and/or job configuration&依赖 -job parameter&setnumreducetasks ( and int ) )&依赖 -rest&framework and/or job configuration&AGGREGATION -JobConf&input file&依赖 -JobConf&set&依赖 -set&input file&AGGREGATION -JobConf&( jobconf , path&依赖 -job task&a speculative manner ( setmapspeculativeexecution&依赖 -percentage&tasks failure&AGGREGATION -other advanced facet&job&AGGREGATION -maximum number&attempt&AGGREGATION -user&course&依赖 -user&/&依赖 -large amount&( read-only ) datum&AGGREGATION -child process&process&GENERALIZATION -TaskTracker&Mapper / Reducer task&依赖 -TaskTracker&child process&依赖 -TaskTracker&separate jvm&依赖 -parent TaskTracker&TaskTracker&GENERALIZATION -environment&parent TaskTracker&AGGREGATION -child-task&environment&依赖 -child-task&parent TaskTracker&依赖 -user&mapr&依赖 -user&child-jvm&依赖 -user&additional option&依赖 -{ map | reduce }&{ map | reduce }&依赖 -configuration parameter¶meter&GENERALIZATION -value&taskid&AGGREGATION -taskid&MapReduce task&AGGREGATION -MapReduce task&task&GENERALIZATION -it&jconsole&依赖 -start&passwordless JVM JMX agent&AGGREGATION -It&maximum heap-size&依赖 -It&map&依赖 -maximum heap-size&map&AGGREGATION -It&child-jvm&依赖 -It&additional path&依赖 -It&java.library.path&依赖 -java.library.path&child-jvm&AGGREGATION -Memory Management Users/admins&maximum virtual memory&依赖 -maximum virtual memory&launched child-task&AGGREGATION -Memory Management Users/admins&launched child-task&依赖 -child.ulimit&kilo byte kb )&依赖 -Environment&Hadoop Daemons&AGGREGATION -part&framework&AGGREGATION -datum&disk&依赖 -datum&frequency&依赖 -concurrency&operation&AGGREGATION -tuning¶meter&AGGREGATION -default limit&Virtual Memory&AGGREGATION -user&Virtual Memory&依赖 -user&default limit&依赖 -user¶meter&依赖 -user&job&依赖 -name type description mapred.task.maxvmem&number&依赖 -task&job&AGGREGATION -name type description mapred.task.maxvmem&byte&依赖 -it&number&依赖 -it&more Virtual Memory&依赖 -mapred.task.maxpmem int&number&依赖 -mapred.task.maxpmem int&byte&依赖 -over-scheduling&task&AGGREGATION -a buffer and metada&accounting buffer&依赖 -Map parameter&a buffer and metada&依赖 -contents&buffer&AGGREGATION -contents&disk&依赖 
-contents&background&依赖 -map&output record&依赖 -serialization buffer&threshold&依赖 -on-disk segment&single file&依赖 -record&disk&依赖 -larger buffer&memory&依赖 -number&spill&AGGREGATION -name type description io.sort.mb int&serialization and accounting buffer&依赖 -name type description io.sort.mb int&cumulative size&依赖 -cumulative size&serialization and accounting buffer&AGGREGATION -ratio&serialization&AGGREGATION -serialized record&information&依赖 -serialized record&information&依赖 -serialized record&16 byte&依赖 -serialized record&16 byte&依赖 -serialized record&information&依赖 -16 byte&information&AGGREGATION -serialized record&16 byte&依赖 -serialized record&16 byte&依赖 -its&size& -serialized record&information&依赖 -percentage&probability&依赖 -probability&spill&AGGREGATION -exhaustion&serialization buffer&AGGREGATION -percentage&spill&依赖 -percentage&disk being&依赖 -percentage&space&AGGREGATION -higher value&disk&依赖 -higher value&number&依赖 -higher value&number&依赖 -higher value&spill&依赖 -higher value&spill&依赖 -higher value&spill&依赖 -higher value&disk&依赖 -higher value&disk&依赖 -higher value&number&依赖 -higher value&spill&依赖 -higher value&number&依赖 -higher value&disk&依赖 -contents&disk&依赖 -percentage&buffer&AGGREGATION -their&contents& -contents&background&依赖 -*&* 2 ^ 16&依赖 -maximum number&record&AGGREGATION -higher value&merge&依赖 -higher value&number&依赖 -higher value&eliminate&依赖 -number&merge&AGGREGATION -probability&map task&AGGREGATION -size&map output&AGGREGATION -map output&output&GENERALIZATION -0.66&buffer&AGGREGATION -io.sort.buffer.spill.percent&0.33&依赖 -next spill&all collect record&依赖 -remainder&buffer&AGGREGATION -threshold&other word&依赖 -threshold&trigger&依赖 -record&spill&依赖 -record&combiner&依赖 -Shuffle/Reduce Parameters&output&依赖 -Shuffle/Reduce Parameters&output&依赖 -output&memory&依赖 -intermediate compression&map output&AGGREGATION -option&merge&依赖 -option&merge&依赖 -option&frequency&依赖 -frequency&merge&AGGREGATION -option&frequency&依赖 -option&frequency&依赖 -option&merge&依赖 -number&segment&AGGREGATION -number&same time&依赖 -It&number&依赖 -It&open file and compression codec&依赖 -It&open file and compression codec&依赖 -number&open file and compression codec&AGGREGATION -It&number&依赖 -merge&several pass&依赖 -number&file&AGGREGATION -number&limit&依赖 -limit&map&依赖 -number&sorted map output&AGGREGATION -unit&partition&AGGREGATION -threshold&only frequency&依赖 -only frequency&in-memory merge&AGGREGATION -threshold&in-memory merge&依赖 -threshold&only frequency&依赖 -threshold&in-memory merge&依赖 -memory threshold&threshold&GENERALIZATION -mapred.job.shuffle.merge.percent float&memory threshold&依赖 -mapred.job.shuffle.merge.percent float&fetched map output&依赖 -mapred.job.shuffle.merge.percent float&fetched map output&依赖 -percentage&memory&AGGREGATION -mapred.job.shuffle.merge.percent float&memory threshold&依赖 -mapred.job.shuffle.merge.percent float&fetched map output&依赖 -mapred.job.shuffle.merge.percent float&memory threshold&依赖 -whose&input& -parameter&only frequency&依赖 -parameter&only frequency&依赖 -parameter&in-memory merge&依赖 -parameter&in-memory merge&依赖 -mapred.job.shuffle.input.buffer.percent float&percentage&依赖 -mapred.job.shuffle.input.buffer.percent float&memory&依赖 -it&large and numerous map output&依赖 -memory&framework&依赖 -mapred.job.reduce.input.buffer.percent float&percentage&依赖 -mapred.job.reduce.input.buffer.percent float&memory relative&依赖 -percentage&memory relative&AGGREGATION -map output&disk&依赖 -map output&default&依赖 -larger than 25 percent&memory&AGGREGATION -it&disk&依赖 -combiner&merge&依赖 
-one&time&依赖 -part&intermediate merge&AGGREGATION -in-memory map output&intermediate merge&依赖 -Directory Structure&localized cache&依赖 -Directory Structure&local directory , $ { mapred.local.dir } / taskTracker /&依赖 -Directory Structure&localized cache&依赖 -Directory Structure&local directory , $ { mapred.local.dir } / taskTracker /&依赖 -It&multiple local directory&依赖 -filename&semi-random local directory&依赖 -task tracker&localized job directory&依赖 -job&user&AGGREGATION -directory&localized public distributed cache&依赖 -localized public distributed cache&user&依赖 -tasks and job&user&AGGREGATION -localized public distributed cache&tasks and job&依赖 -job&specific user&AGGREGATION -directory&localized private distributed cache&依赖 -tasks and job&specific user&AGGREGATION -localized private distributed cache&tasks and job&依赖 -localized private distributed cache&specific user&依赖 -job&other user&AGGREGATION -It&job&依赖 -It&other user&依赖 -task&space&依赖 -task&them&依赖 -task&scratch space and share file&依赖 -directory&configuration property job.local.dir&依赖 -directory&user&依赖 -directory&API JobConf.getJobLocalDir ( )&依赖 -System property&property&GENERALIZATION -It&System property&依赖 -user&directory&依赖 -user&system.getproperty&依赖 -user&directory&依赖 -user&system.getproperty&依赖 -jars directory&job jar file&依赖 -jars directory&directory&GENERALIZATION -application&file& -It&task&依赖 -It&job start&依赖 -It&jars directory&依赖 -job.jar location&api JobConf.getJar ( )&依赖 -job.jar location&location&GENERALIZATION -job.jar location&application&依赖 -task directory&directory&GENERALIZATION -task directory&structure&依赖 -current working directory&$ taskid/work&依赖 -current working directory&etc .&依赖 -jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 -jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 -jvm&temporary directory&依赖 -directory&jvm reuse&依赖 -jvm&temporary directory&依赖 -jvm&temporary directory&依赖 -jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 -jvm&temporary directory&依赖 -jvm&temporary directory&依赖 -jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 -jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 -value&temporary directory&AGGREGATION -( user&property mapred.child.tmp&依赖 -property mapred.child.tmp&mapred.child.tmp&GENERALIZATION -( user&value&依赖 -( user&temporary directory&依赖 -( user&map&依赖 -task&directory& -it&work directory&依赖 -Djava.io.tmpdir&tmp dir '&依赖 -absolute path&tmp dir '&AGGREGATION -Djava.io.tmpdir&absolute path&依赖 -child java task&option&依赖 -TMPDIR = '&tmp dir ' )&AGGREGATION -Pipes and streaming&environment variable&依赖 -Pipes and streaming&tmp dir ' )&依赖 -Pipes and streaming&TMPDIR = '&依赖 -mapred.child.tmp&value&依赖 -/ tmp Task JVM Reuse Jobs&task jvm&依赖 -number&same job )&AGGREGATION -task&same job )&AGGREGATION -number&task&AGGREGATION -One&value greater than 1 and ( int&依赖 -task&execution& -id&task&AGGREGATION -start&map input split&AGGREGATION -number&temporary output directory note&依赖 -execution&streaming job&AGGREGATION -mapred.jar string job.jar location&string task&依赖 -mapred.jar string job.jar location&string task&依赖 -name&" mapred " parameter&AGGREGATION -mapred.jar string job.jar location&string task&依赖 -name&streaming job&依赖 -number&byte&AGGREGATION -streaming job&job&GENERALIZATION -name&execution&依赖 -job&mapper/reducer& -task logs standard output ( stdout ) and error ( stderr&task&AGGREGATION -native library&task&依赖 -native library&task&依赖 -native library&task&依赖 
-child-jvm&java.library.path and LD_LIBRARY_PATH&依赖 -child-jvm¤t working directory&依赖 -its&directory& -cached library&System.loadLibrary or System.load&依赖 -More detail&native_libraries.html&依赖 -user-job&primary interface&依赖 -user-job&JobTracker&依赖 -component-tasks&reports& -cluster&information& -their&progress& -input and output specification&job&AGGREGATION -DistributedCache&job&AGGREGATION -job&jar& -Job history file&user specified directory hadoop.job.history.user.location&依赖 -file&specified directory&依赖 -file&" _ logs/history /&依赖 -they&default&依赖 -command&job detail&依赖 -User&history logs summary&依赖 -User&command $ bin/hadoop job&依赖 -history&output directory listing&依赖 -history&filter log file&依赖 -history&output directory listing&依赖 -history&OutputLogFilter&依赖 -history&filter log file&依赖 -history&OutputLogFilter&依赖 -user&application&依赖 -Job Authorization Job level authorization and queue level authorization&cluster&依赖 -user&job detail&依赖 -access control check&JobTracker&依赖 -access control check&user&依赖 -job submitter&job&依赖 -job submitter&access control list&依赖 -job submitter&configuration property&依赖 -owner mapred.queue.queue-name .&access&依赖 -queue administrator&queue&AGGREGATION -owner mapred.queue.queue-name .&access&依赖 -job&owner& -job&queue&依赖 -job view ACL&user&依赖 -job view ACL&configured mapreduce.job.acl-view-job&依赖 -job level counter task level counter task&web ui other information&依赖 -its&profile& -job level counter task level counter task&user&依赖 -JobTracker&information& -job level counter task level counter task&web ui other information&依赖 -its&status& -job level counter task level counter task&job&依赖 -job level counter task level counter task&status&依赖 -job level counter task level counter task&status&依赖 -job level counter task level counter task&user&依赖 -tasks&task& -job level counter task level counter task&job&依赖 -job modification ACL&user&依赖 -job modification ACL&configured mapreduce.job.acl-modify-job&依赖 -priority&job&AGGREGATION -operation&queue level acl&依赖 -operation&queue level acl&依赖 -caller&operation&依赖 -he/she&queue admin acl or job modification acl&依赖 -part&queue admin acl or job modification acl&AGGREGATION -format&job level ACL&AGGREGATION -Job Control Users&complex task&依赖 -output&turn&依赖 -output&distributed file-system&依赖 -output&next job&依赖 -output&distributed file-system&依赖 -output&input&依赖 -various job-control option be&such case&依赖 -various job-control option be&such case&依赖 -submitjob ( jobconf )&job&依赖 -jobconf.setjobendnotificationuri ( string )&polling&依赖 -Kerberos&command& -user&secure cluster&依赖 -user&Job Credentials&依赖 -user&' kinit command&依赖 -Job Credentials&Credentials&GENERALIZATION -we&scalability concern&依赖 -we&' ticket&依赖 -we&MapReduce job&依赖 -Kerberos&tickets& -client&Kerberos& -we&delegation token&依赖 -we&them&依赖 -part&job submission&AGGREGATION -delegation token&hdf&依赖 -HDFS system&FileInputFormats&依赖 -hdf&staging directory&依赖 -Other application&configuration " mapreduce.job.hdfs-servers "&依赖 -comma separated list&file system name&AGGREGATION -token&part&依赖 -token&JobTracker&依赖 -token&Credentials&依赖 -token&job submission&依赖 -we&MapReduce delegation token&依赖 -task&job&依赖 -task&JobTracker&依赖 -task&MapReduce delegation token&依赖 -delegation token&JobClient.getDelegationToken&依赖 -delegation token&token&GENERALIZATION -delegation token&API&依赖 -obtained token&credentials&依赖 -credentials&part&依赖 -credentials&JobTracker&依赖 -part&job submission process&AGGREGATION -credentials&job submission process&依赖 -JobTracker&its filesystem (&依赖 
-JobTracker&its filesystem (&依赖 -JobTracker&hdf&依赖 -JobTracker&hdf&依赖 -JobTracker&tokens and secret&依赖 -JobTracker&tokens and secret&依赖 -JobTracker&tokens and secret&依赖 -its&filesystem& -JobTracker&its filesystem (&依赖 -JobTracker&hdf&依赖 -TaskTracker&part job localization&依赖 -TaskTracker&file&依赖 -task&environment variable&依赖 -task&HADOOP_TOKEN_FILE_LOCATION&依赖 -task&configuration " mapreduce.job.credentials.binary "&依赖 -HDFS delegation token&JobTracker&依赖 -task&job&依赖 -task&job&依赖 -whose&tasks& -job&same token&依赖 -arbitrary secret&task&依赖 -arbitrary secret&access other third party service&依赖 -arbitrary secret&HDFS delegation token&依赖 -arbitrary secret&job submission&依赖 -Mapper/Reducer class&JobConfigurable&实现 -similar thing&new MapReduce API&依赖 -similar thing&Mapper.setup method&依赖 -task&api&依赖 -task&api&依赖 -task&secret&依赖 -task&secret&依赖 -Job Input InputFormat&input-specification&依赖 -Job Input InputFormat&MapReduce job&依赖 -InputFormat&job to&AGGREGATION -input-specification&job&AGGREGATION -input file&file&GENERALIZATION -sub-class&FileInputFormat&AGGREGATION -default behavior&input&依赖 -default behavior&file-based InputFormat implementation&AGGREGATION -total size&input file&AGGREGATION -byte&input file&AGGREGATION -FileSystem blocksize&input file&AGGREGATION -logical split&many application&依赖 -logical split&many application&依赖 -application&RecordReader&实现 -application&such case&实现 -record-oriented view&logical InputSplit&AGGREGATION -TextInputFormat&given job&依赖 -framework&input-file&依赖 -framework&input-file with ###&依赖 -them&appropriate CompressionCodec&依赖 -its&entirety& -compressed file&single mapper&依赖 -compressed file&entirety&依赖 -InputSplit InputSplit&InputSplit&GENERALIZATION -InputSplit InputSplit&datum&依赖 -it&RecordReader&依赖 -InputSplit&byte-oriented view&依赖 -byte-oriented view&input&AGGREGATION -responsibility&RecordReader&AGGREGATION -InputSplit&input&依赖 -It&path&依赖 -It&logical split&依赖 -It&map.input.file&依赖 -path&input file&AGGREGATION -RecordReader RecordReader&InputSplit&依赖 -RecordReader RecordReader&< key , value > pair&依赖 -RecordReader RecordReader&RecordReader&GENERALIZATION -RecordReader&byte-oriented view&依赖 -RecordReader&input&依赖 -RecordReader&responsibility&依赖 -RecordReader&processing record boundary&依赖 -Job Output OutputFormat&MapReduce job&依赖 -Job Output OutputFormat&output-specification&依赖 -OutputFormat&job to&AGGREGATION -output-specification&job&AGGREGATION -output file&job&AGGREGATION -Output file&FileSystem&依赖 -commit&task output&AGGREGATION -OutputCommitter OutputCommitter&OutputCommitter&GENERALIZATION -OutputCommitter OutputCommitter&MapReduce job&依赖 -OutputCommitter OutputCommitter&commit&依赖 -OutputCommitter&job to&AGGREGATION -MapReduce framework&OutputCommitter&依赖 -MapReduce framework&job to&依赖 -initialization&job&AGGREGATION -Job setup&separate task&依赖 -job&state&依赖 -Job cleanup&separate task&依赖 -Job cleanup&end&依赖 -Job cleanup&job&依赖 -end&job&AGGREGATION -Task setup&same task&依赖 -part&same task&AGGREGATION -Task setup&part&依赖 -Task setup&task initialization&依赖 -task&exception block )&依赖 -Job setup/cleanup task&slot&依赖 -Job setup/cleanup task&map&依赖 -JobCleanup task&task&GENERALIZATION -JobCleanup task&highest priority&依赖 -two instance&same Mapper or Reducer&AGGREGATION -application-writer&using&依赖 -application-writer&unique name&依赖 -output&task-attempt&AGGREGATION -MapReduce framework&_ $ { taskid&依赖 -MapReduce framework&FileSystem&依赖 -successful completion&task-attempt&AGGREGATION -file&task-attempt&依赖 -file&successful completion&依赖 -file&$ { 
mapred.output.dir }&依赖 -sub-directory&unsuccessful task-attempt&AGGREGATION -framework&sub-directory&依赖 -framework&sub-directory&依赖 -framework&unsuccessful task-attempt&依赖 -framework&unsuccessful task-attempt&依赖 -process&application&依赖 -execution&task&AGGREGATION -$ { mapred.work.output.dir }&task&AGGREGATION -advantage&feature&AGGREGATION -framework&succesful task-attempt&依赖 -application-writer&advantage&依赖 -framework&them&依赖 -application-writer&feature&依赖 -value&MapReduce framework&依赖 -value&$ { mapred.work.output.dir }&AGGREGATION -execution&particular task-attempt&AGGREGATION -output&hdf&依赖 -map&job&AGGREGATION -output&case&依赖 -output&hdf&依赖 -output&case&依赖 -RecordWriter RecordWriter&output < key , value > pair&依赖 -RecordWriter RecordWriter&output file&依赖 -output file&file&GENERALIZATION -RecordWriter implementation&FileSystem&依赖 -RecordWriter implementation&job output&依赖 -Other Useful Features Submitting Jobs&queue&依赖 -Other Useful Features Submitting Jobs&job&依赖 -collection&job&AGGREGATION -queue&acl&依赖 -queue&example&依赖 -who&job&依赖 -who&them&依赖 -Hadoop&single mandatory queue&依赖 -mapred.queue.names property&Hadoop site configuration&AGGREGATION -Queue name&Hadoop site configuration&依赖 -Queue name&mapred.queue.names property&依赖 -job scheduler&support multiple queue&依赖 -job scheduler&support multiple queue&依赖 -job&queue&依赖 -it&' default ' queue&依赖 -job&associated queue name&依赖 -'&queue& -Counters counter&global counter&依赖 -group&type Counters.Group&AGGREGATION -counter&type Counters.Group&依赖 -counter&particular Enum&AGGREGATION -counter&group&依赖 -application&type enum )&依赖 -application&arbitrary counter (&依赖 -arbitrary counter (&type enum )&AGGREGATION -DistributedCache DistributedCache&application-specific , large , read-only file&依赖 -DistributedCache DistributedCache&DistributedCache&GENERALIZATION -application&file&依赖 -/&/&GENERALIZATION -job&node&依赖 -framework&task&依赖 -framework&necessary file&依赖 -framework&necessary file&依赖 -slave node&node&GENERALIZATION -efficiency&fact&依赖 -Its&efficiency& -DistributedCache&cached file&依赖 -DistributedCache&modification timestamp&依赖 -modification timestamp&cached file&AGGREGATION -cache file&application or externally&依赖 -archive&slave node&依赖 -{ file&| archives }&依赖 -they&comma separated path&依赖 -property&distributedcache.addcachearchive ( uri , conf )&依赖 -property&api distributedcache.addcachefile ( uri , conf )&依赖 -user&DistributedCache&依赖 -name&symlink&AGGREGATION -DistributedCache&URI&依赖 -DistributedCache&URI&依赖 -DistributedCache&fragment&依赖 -DistributedCache&URI&依赖 -fragment&URI&AGGREGATION -DistributedCache&fragment&依赖 -DistributedCache&fragment&依赖 -uri hdf&symlink name&依赖 -uri hdf&lib.so&依赖 -uri hdf&symlink name&依赖 -uri hdf&lib.so&依赖 -symlink name&name&GENERALIZATION -uri hdf&symlink name&依赖 -uri hdf&lib.so&依赖 -uri hdf&lib.so&依赖 -uri hdf&lib.so&依赖 -uri hdf&symlink name&依赖 -uri hdf&symlink name&依赖 -uri hdf&lib.so&依赖 -uri hdf&lib.so&依赖 -uri hdf&lib.so&依赖 -uri hdf&lib.so&依赖 -uri hdf&symlink name&依赖 -uri hdf&lib.so&依赖 -uri hdf&symlink name&依赖 -uri hdf&symlink name&依赖 -uri hdf&lib.so&依赖 -uri hdf&symlink name&依赖 -uri hdf&symlink name&依赖 -uri hdf&symlink name&依赖 -uri hdf&symlink name&依赖 -uri hdf&symlink name&依赖 -uri hdf&symlink name&依赖 -task&cwd& -uri hdf&lib.so&依赖 -uri hdf&symlink name&依赖 -uri hdf&lib.so&依赖 -uri hdf&lib.so&依赖 -uri hdf&lib.so&依赖 -DistributedCache&reduce&依赖 -DistributedCache&use&依赖 -DistributedCache&rudimentary software distribution mechanism&依赖 -classpath&child-jvm&AGGREGATION -distributedcache.addarchivetoclasspath ( path&cache 
files/jars&依赖 -directory&task&AGGREGATION -they&slave node&依赖 -whose job&file&依赖 -whose&jobs& -Private " DistributedCache file&local directory&依赖 -file&specific user&依赖 -file&tasks and job&依赖 -its&permissions& -virtue&permission&AGGREGATION -directory path&lookup&依赖 -file&world readable access&依赖 -directory path&lookup&依赖 -directory path&world executable access&依赖 -directory path&world executable access&依赖 -directory path&path&GENERALIZATION -" Public " DistributedCache file&global directory&依赖 -file&tasks and job&依赖 -file&user&依赖 -file&slave&依赖 -file&user&依赖 -Tool The Tool interface&handling&依赖 -handling&generic Hadoop command-line option&AGGREGATION -Tool The Tool interface&generic Hadoop command-line option&依赖 -Tool&MapReduce tool or application&依赖 -application&standard command-line option&依赖 -its&arguments& -handling&standard command-line option&AGGREGATION -application&handling&依赖 -TaskTracker&directory& -$ cd / taskTracker /&$ bin/hadoop org.apache.hadoop.mapred.IsolationRunner&依赖 -failed task&node&依赖 -IsolationRunner&same input&依赖 -IsolationRunner&single jvm&依赖 -IsolationRunner&failed task&依赖 -IsolationRunner&map task&依赖 -3 ) sample&built-in java profiler&AGGREGATION -sample&map&AGGREGATION -User&profiler information&依赖 -profiler information&user log directory&依赖 -profiling&default&依赖 -profiling&job&依赖 -she/he&configuration property mapred.task.profile&依赖 -configuration property&property&GENERALIZATION -{ maps |&MapReduce task&依赖 -{ maps |&range&依赖 -{ maps |&reduce }&依赖 -range&MapReduce task&AGGREGATION -specified range&default&依赖 -User&profiler configuration argument&依赖 -string&a %&依赖 -it&name&依赖 -it&profiling output file&依赖 -name&profiling output file&AGGREGATION -parameter&command line&依赖 -parameter&task child JVM&依赖 -file = %&file = %&依赖 -user&debug script&依赖 -task&stdout& -output&console diagnostic&依赖 -script&stdout& -part&job uus&AGGREGATION -we&debug script&依赖 -user&DistributedCache&依赖 -a quick way&value&依赖 -a quick way&property&依赖 -a quick way&map&依赖 -debug script&command-line option&依赖 -debug script&streaming mode&依赖 -streaming mode&mode&GENERALIZATION -task&files& -Pipes program&c++ program name&依赖 -Pipes program&command&依赖 -Pipes program&fifth argument&依赖 -their&dependencies& -set&MapReduce job&AGGREGATION -utility&MapReduce job&依赖 -utility&set&依赖 -job-outputs i.e. output&reduce&AGGREGATION -both performance ( zlib ) and non-availability&Java library&AGGREGATION -native implementation&above compression codec&AGGREGATION -reason&both performance ( zlib ) and non-availability&AGGREGATION -their&usage& -compression&intermediate map-output&AGGREGATION -Intermediate Outputs application&compression&依赖 -Intermediate Outputs application&intermediate map-output&依赖 -Intermediate Outputs application&intermediate map-output&依赖 -Intermediate Outputs application&compression&依赖 -job outputs application&compression&依赖 -job outputs application&fileoutputformat.setcompressoutput ( jobconf , boolean ) apus&依赖 -compression&job-output&AGGREGATION -job outputs application&fileoutputformat.setoutputcompressorclass ( jobconf&依赖 -job outputs application&class ) apus and fileoutputformat.setoutputcompressorclass ( jobconf&依赖 -require sequencefile.compressiontype ( i.e. 
record&sequencefileoutputformat.setoutputcompressiontype ( jobconf , sequencefile.compressiontype ) apus&依赖 -certain set&bad input record&AGGREGATION -application&feature&依赖 -application&SkipBadRecords class&依赖 -map task crash&certain input&依赖 -map task crash&certain input&依赖 -user&bug&依赖 -source code&which&依赖 -bug&example&依赖 -task&such case&依赖 -task&multiple attempt&依赖 -small portion&datum&AGGREGATION -feature&default&依赖 -'&certain number&依赖 -'&map failure&依赖 -'&mode&依赖 -certain number&map failure&AGGREGATION -map task&record&依赖 -' skip mode '&' skip mode '&依赖 -map task&range&依赖 -range&record&AGGREGATION -framework&processed record counter&依赖 -skipbadrecords.counter _ map_processed_records and skipbadrecords.counter _ reduce_processed_groups&skipbadrecords.counter _ map_processed_records and skipbadrecords.counter _ reduce_processed_groups&依赖 -counter&framework&依赖 -what record range&task&依赖 -what record range&what record range&依赖 -range&further attempt&依赖 -number&record&AGGREGATION -processed record counter&application&依赖 -application&processing&依赖 -their&processing& -framework&additional record&依赖 -framework&bad record&依赖 -framework&bad record&依赖 -framework&additional record&依赖 -user&skipped record&依赖 -number&skipped record&AGGREGATION -user&number&依赖 -user&skipped record&依赖 -user&skipped record&依赖 -user&number&依赖 -user&number&依赖 -framework&range&依赖 -range&skipped record&AGGREGATION -framework&binary search-like approach&依赖 -framework&skipped record&依赖 -skipped range&two half&依赖 -framework&bad record&依赖 -number&task attempt&AGGREGATION -Skipped record&sequence file format&依赖 -Skipped record&hdf&依赖 -Skipped record&later analysis&依赖 -location&skipbadrecords.setskipoutputpath ( jobconf , path )&依赖 -many&feature&AGGREGATION -more complete WordCount&feature&依赖 -more complete WordCount&many&依赖 -it&pseudo-distributed or fully-distributed Hadoop installation&依赖 -public class WordCount&Tool { 14&实现 -private boolean casesensitive = true ; 23&private boolean casesensitive = true ; 23&依赖 -private long numrecord = 0&26&依赖 -private long numrecord = 0&private long numrecord = 0&依赖 -public void configure ( jobconf job&) { 29&依赖 -JobConf&job&GENERALIZATION -( job.getboolean&false ) ) { 33&依赖 -( job.getboolean&false ) ) { 33&依赖 -private void parseskipfile ( path patternsfile&) { 46&依赖 -file&+& -public void map&) throw ioexception { 58&依赖 -public void map&) throw ioexception { 58&依赖 -public void map&) throw ioexception { 58&依赖 -public void map&) throw ioexception { 58&依赖 -while ( tokenizer.hasmoretokens&) ) { 66&依赖 -100 ) == 0 ) { 72&( ( + + numrecord&依赖 -100 ) == 0 ) { 72&100 ) == 0 ) { 72&依赖 -100 ) == 0 ) { 72&( ( + + numrecord&依赖 -100 ) == 0 ) { 72&100 ) == 0 ) { 72&依赖 -100 ) == 0 ) { 72&100 ) == 0 ) { 72&依赖 -100 ) == 0 ) { 72&( ( + + numrecord&依赖 -intwritable > { 78&intwritable > { 78&依赖 -public void reduce&) throw ioexception { 79&依赖 -public void reduce&) throw ioexception { 79&依赖 -public void reduce&) throw ioexception { 79&依赖 -public void reduce&) throw ioexception { 79&依赖 -while ( values.hasnext ( ) ) { 81&while ( values.hasnext ( ) ) { 81&依赖 -public int&Exception { 88&依赖 -public int&Exception { 88&依赖 -public int&Exception { 88&依赖 -} else { 107&} else { 107&依赖 -public static void main ( string [ ] arg&Exception { 119&依赖 -they&output&依赖 -plug-in a pattern-file&word-pattern&依赖 -let&DistributedCache&依赖 -let&DistributedCache&依赖 -let&plug-in a pattern-file&依赖 -let&plug-in a pattern-file&依赖 -second version&previous one&依赖 -2 highlight&usr/joe/wordcount / output/part -00000 bye 1 goodbye 1 hadoop 2 hello 
2 world&依赖 -second version&WordCount&AGGREGATION -second version&previous one&依赖 -application&configuration parameter&依赖 -configure method&mapper ( and reducer ) implementation ( line&AGGREGATION -it&word-pattern&依赖 -it&user&依赖 -it&skip&依赖 -utility&Tool interface&AGGREGATION -application&counters ( line 68 )&依赖 -they&application-specific status information&依赖 -they&Reporter instance&依赖 -registered trademark&Sun Microsystems , Inc.&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt deleted file mode 100644 index f5c558ffa97667ffdd0579e7dc81ebfa0866d610..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt +++ /dev/null @@ -1,853 +0,0 @@ -Purpose -This document comprehensively describes all user-facing facets of the Hadoop MapReduce framework and serves as a tutorial. - -Prerequisites -Ensure that Hadoop is installed, configured and is running. More details: - -Single Node Setup for first-time users. -Cluster Setup for large, distributed clusters. -Overview -Hadoop MapReduce is a software framework for easily writing applications which process vast amounts of data (multi-terabyte data-sets) in-parallel on large clusters (thousands of nodes) of commodity hardware in a reliable, fault-tolerant manner. - -A MapReduce job usually splits the input data-set into independent chunks which are processed by the map tasks in a completely parallel manner. The framework sorts the outputs of the maps, which are then input to the reduce tasks. Typically both the input and the output of the job are stored in a file-system. The framework takes care of scheduling tasks, monitoring them and re-executes the failed tasks. - -Typically the compute nodes and the storage nodes are the same, that is, the MapReduce framework and the Hadoop Distributed File System (see HDFS Architecture Guide) are running on the same set of nodes. This configuration allows the framework to effectively schedule tasks on the nodes where data is already present, resulting in very high aggregate bandwidth across the cluster. - -The MapReduce framework consists of a single master JobTracker and one slave TaskTracker per cluster-node. The master is responsible for scheduling the jobs' component tasks on the slaves, monitoring them and re-executing the failed tasks. The slaves execute the tasks as directed by the master. - -Minimally, applications specify the input/output locations and supply map and reduce functions via implementations of appropriate interfaces and/or abstract-classes. These, and other job parameters, comprise the job configuration. The Hadoop job client then submits the job (jar/executable etc.) and configuration to the JobTracker which then assumes the responsibility of distributing the software/configuration to the slaves, scheduling tasks and monitoring them, providing status and diagnostic information to the job-client. - -Although the Hadoop framework is implemented in JavaTM, MapReduce applications need not be written in Java. - -Hadoop Streaming is a utility which allows users to create and run jobs with any executables (e.g. shell utilities) as the mapper and/or the reducer. -Hadoop Pipes is a SWIG- compatible C++ API to implement MapReduce applications (non JNITM based). 
-Inputs and Outputs -The MapReduce framework operates exclusively on <key, value> pairs, that is, the framework views the input to the job as a set of <key, value> pairs and produces a set of <key, value> pairs as the output of the job, conceivably of different types. - -The key and value classes have to be serializable by the framework and hence need to implement the Writable interface. Additionally, the key classes have to implement the WritableComparable interface to facilitate sorting by the framework (an illustrative sketch of such a key follows the compile commands below). - -Input and Output types of a MapReduce job: - -(input) <k1, v1> -> map -> <k2, v2> -> combine -> <k2, v2> -> reduce -> <k3, v3> (output) - -Example: WordCount v1.0 -Before we jump into the details, let's walk through an example MapReduce application to get a flavour for how they work. - -WordCount is a simple application that counts the number of occurrences of each word in a given input set. - -This works with a local-standalone, pseudo-distributed or fully-distributed Hadoop installation (Single Node Setup). - -Source Code -WordCount.java -1. package org.myorg; -2. -3. import java.io.IOException; -4. import java.util.*; -5. -6. import org.apache.hadoop.fs.Path; -7. import org.apache.hadoop.conf.*; -8. import org.apache.hadoop.io.*; -9. import org.apache.hadoop.mapred.*; -10. import org.apache.hadoop.util.*; -11. -12. public class WordCount { -13. -14. public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { -15. private final static IntWritable one = new IntWritable(1); -16. private Text word = new Text(); -17. -18. public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { -19. String line = value.toString(); -20. StringTokenizer tokenizer = new StringTokenizer(line); -21. while (tokenizer.hasMoreTokens()) { -22. word.set(tokenizer.nextToken()); -23. output.collect(word, one); -24. } -25. } -26. } -27. -28. public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { -29. public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { -30. int sum = 0; -31. while (values.hasNext()) { -32. sum += values.next().get(); -33. } -34. output.collect(key, new IntWritable(sum)); -35. } -36. } -37. -38. public static void main(String[] args) throws Exception { -39. JobConf conf = new JobConf(WordCount.class); -40. conf.setJobName("wordcount"); -41. -42. conf.setOutputKeyClass(Text.class); -43. conf.setOutputValueClass(IntWritable.class); -44. -45. conf.setMapperClass(Map.class); -46. conf.setCombinerClass(Reduce.class); -47. conf.setReducerClass(Reduce.class); -48. -49. conf.setInputFormat(TextInputFormat.class); -50. conf.setOutputFormat(TextOutputFormat.class); -51. -52. FileInputFormat.setInputPaths(conf, new Path(args[0])); -53. FileOutputFormat.setOutputPath(conf, new Path(args[1])); -54. -55. JobClient.runJob(conf); -57. } -58. } -59. -Usage -Assuming HADOOP_HOME is the root of the installation and HADOOP_VERSION is the Hadoop version installed, compile WordCount.java and create a jar: - -$ mkdir wordcount_classes -$ javac -classpath ${HADOOP_HOME}/hadoop-${HADOOP_VERSION}-core.jar -d wordcount_classes WordCount.java -$ jar -cvf /usr/joe/wordcount.jar -C wordcount_classes/ .
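As an aside on the serialization requirement noted above (keys must implement WritableComparable so the framework can serialize and sort them), the following is a minimal, assumed sketch of a custom key; the class name WordYearKey and its fields are illustrative and are not part of the tutorial's example.

package org.myorg;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Illustrative composite key: a word plus a year.
public class WordYearKey implements WritableComparable<WordYearKey> {
  private String word = "";
  private int year;

  public void set(String word, int year) {
    this.word = word;
    this.year = year;
  }

  // Serialization used by the framework when shuffling keys between tasks.
  public void write(DataOutput out) throws IOException {
    out.writeUTF(word);
    out.writeInt(year);
  }

  public void readFields(DataInput in) throws IOException {
    word = in.readUTF();
    year = in.readInt();
  }

  // Ordering used by the framework when sorting keys before the reduce.
  public int compareTo(WordYearKey other) {
    int cmp = word.compareTo(other.word);
    if (cmp != 0) {
      return cmp;
    }
    return year < other.year ? -1 : (year == other.year ? 0 : 1);
  }

  // hashCode() drives the default HashPartitioner, so keep it consistent with compareTo().
  public int hashCode() {
    return word.hashCode() * 31 + year;
  }

  public boolean equals(Object o) {
    return (o instanceof WordYearKey) && compareTo((WordYearKey) o) == 0;
  }
}

Such a key could then be used in place of Text, provided the driver declares it, for example via conf.setOutputKeyClass(WordYearKey.class) (or conf.setMapOutputKeyClass if the reduce output key differs).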
- -Assuming that: - -/usr/joe/wordcount/input - input directory in HDFS -/usr/joe/wordcount/output - output directory in HDFS -Sample text-files as input: - -$ bin/hadoop dfs -ls /usr/joe/wordcount/input/ -/usr/joe/wordcount/input/file01 -/usr/joe/wordcount/input/file02 - -$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file01 -Hello World Bye World - -$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file02 -Hello Hadoop Goodbye Hadoop - -Run the application: - -$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount /usr/joe/wordcount/input /usr/joe/wordcount/output - -Output: - -$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000 -Bye 1 -Goodbye 1 -Hadoop 2 -Hello 2 -World 2 -Applications can specify a comma separated list of paths which would be present in the current working directory of the task using the option -files. The -libjars option allows applications to add jars to the classpaths of the maps and reduces. The option -archives allows them to pass comma separated list of archives as arguments. These archives are unarchived and a link with name of the archive is created in the current working directory of tasks. More details about the command line options are available at Commands Guide. - -Running wordcount example with -libjars, -files and -archives: -hadoop jar hadoop-examples.jar wordcount -files cachefile.txt -libjars mylib.jar -archives myarchive.zip input output Here, myarchive.zip will be placed and unzipped into a directory by the name "myarchive.zip". - -Users can specify a different symbolic name for files and archives passed through -files and -archives option, using #. - -For example, hadoop jar hadoop-examples.jar wordcount -files dir1/dict.txt#dict1,dir2/dict.txt#dict2 -archives mytar.tgz#tgzdir input output Here, the files dir1/dict.txt and dir2/dict.txt can be accessed by tasks using the symbolic names dict1 and dict2 respectively. The archive mytar.tgz will be placed and unarchived into a directory by the name "tgzdir". - -Walk-through -The WordCount application is quite straight-forward. - -The Mapper implementation (lines 14-26), via the map method (lines 18-25), processes one line at a time, as provided by the specified TextInputFormat (line 49). It then splits the line into tokens separated by whitespaces, via the StringTokenizer, and emits a key-value pair of < , 1>. - -For the given sample input the first map emits: -< Hello, 1> -< World, 1> -< Bye, 1> -< World, 1> -The second map emits: -< Hello, 1> -< Hadoop, 1> -< Goodbye, 1> -< Hadoop, 1> -We'll learn more about the number of maps spawned for a given job, and how to control them in a fine-grained manner, a bit later in the tutorial. - -WordCount also specifies a combiner (line 46). Hence, the output of each map is passed through the local combiner (which is same as the Reducer as per the job configuration) for local aggregation, after being sorted on the keys. - -The output of the first map: -< Bye, 1> -< Hello, 1> -< World, 2> -The output of the second map: -< Goodbye, 1> -< Hadoop, 2> -< Hello, 1> -The Reducer implementation (lines 28-36), via the reduce method (lines 29-35) just sums up the values, which are the occurence counts for each key (i.e. words in this example). - -Thus the output of the job is: -< Bye, 1> -< Goodbye, 1> -< Hadoop, 2> -< Hello, 2> -< World, 2> -The run method specifies various facets of the job, such as the input/output paths (passed via the command line), key/value types, input/output formats etc., in the JobConf. 
It then calls the JobClient.runJob (line 55) to submit the job and monitor its progress. - -We'll learn more about JobConf, JobClient, Tool and other interfaces and classes a bit later in the tutorial. - -MapReduce - User Interfaces -This section provides a reasonable amount of detail on every user-facing aspect of the MapReduce framework. This should help users implement, configure and tune their jobs in a fine-grained manner. However, please note that the javadoc for each class/interface remains the most comprehensive documentation available; this is only meant to be a tutorial. - -Let us first take the Mapper and Reducer interfaces. Applications typically implement them to provide the map and reduce methods. - -We will then discuss other core interfaces including JobConf, JobClient, Partitioner, OutputCollector, Reporter, InputFormat, OutputFormat, OutputCommitter and others. - -Finally, we will wrap up by discussing some useful features of the framework such as the DistributedCache, IsolationRunner etc. - -Payload -Applications typically implement the Mapper and Reducer interfaces to provide the map and reduce methods. These form the core of the job. - -Mapper -Mapper maps input key/value pairs to a set of intermediate key/value pairs. - -Maps are the individual tasks that transform input records into intermediate records. The transformed intermediate records do not need to be of the same type as the input records. A given input pair may map to zero or many output pairs. - -The Hadoop MapReduce framework spawns one map task for each InputSplit generated by the InputFormat for the job. - -Overall, Mapper implementations are passed the JobConf for the job via the JobConfigurable.configure(JobConf) method and override it to initialize themselves. The framework then calls map(WritableComparable, Writable, OutputCollector, Reporter) for each key/value pair in the InputSplit for that task. Applications can then override the Closeable.close() method to perform any required cleanup. - -Output pairs do not need to be of the same types as input pairs. A given input pair may map to zero or many output pairs. Output pairs are collected with calls to OutputCollector.collect(WritableComparable,Writable). - -Applications can use the Reporter to report progress, set application-level status messages and update Counters, or just indicate that they are alive. - -All intermediate values associated with a given output key are subsequently grouped by the framework, and passed to the Reducer(s) to determine the final output. Users can control the grouping by specifying a Comparator via JobConf.setOutputKeyComparatorClass(Class). - -The Mapper outputs are sorted and then partitioned per Reducer. The total number of partitions is the same as the number of reduce tasks for the job. Users can control which keys (and hence records) go to which Reducer by implementing a custom Partitioner. - -Users can optionally specify a combiner, via JobConf.setCombinerClass(Class), to perform local aggregation of the intermediate outputs, which helps to cut down the amount of data transferred from the Mapper to the Reducer. - -The intermediate, sorted outputs are always stored in a simple (key-len, key, value-len, value) format. Applications can control if, and how, the intermediate outputs are to be compressed and the CompressionCodec to be used via the JobConf. - -How Many Maps? -The number of maps is usually driven by the total size of the inputs, that is, the total number of blocks of the input files.
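Returning briefly to the custom Partitioner mentioned above, a minimal, assumed sketch is shown below; the class name FirstLetterPartitioner and its routing rule are illustrative only and not part of the tutorial. It would be registered in the driver with conf.setPartitionerClass(FirstLetterPartitioner.class).

package org.myorg;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// Illustrative only: routes each word to a reduce task based on its first character,
// so that words starting with the same letter end up in the same output partition.
public class FirstLetterPartitioner implements Partitioner<Text, IntWritable> {

  public void configure(JobConf job) {
    // No per-job configuration needed for this sketch.
  }

  public int getPartition(Text key, IntWritable value, int numReduceTasks) {
    String word = key.toString();
    char first = word.isEmpty() ? ' ' : Character.toLowerCase(word.charAt(0));
    // Guard against the degenerate zero-reduce case; the partitioner is not used then anyway.
    return numReduceTasks == 0 ? 0 : (first & Integer.MAX_VALUE) % numReduceTasks;
  }
}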
- -The right level of parallelism for maps seems to be around 10-100 maps per-node, although it has been set up to 300 maps for very cpu-light map tasks. Task setup takes a while, so it is best if the maps take at least a minute to execute. - -Thus, if you expect 10TB of input data and have a blocksize of 128MB, you'll end up with 82,000 maps, unless setNumMapTasks(int) (which only provides a hint to the framework) is used to set it even higher. - -Reducer -Reducer reduces a set of intermediate values which share a key to a smaller set of values. - -The number of reduces for the job is set by the user via JobConf.setNumReduceTasks(int). - -Overall, Reducer implementations are passed the JobConf for the job via the JobConfigurable.configure(JobConf) method and can override it to initialize themselves. The framework then calls the reduce(WritableComparable, Iterator, OutputCollector, Reporter) method for each <key, (list of values)> pair in the grouped inputs. Applications can then override the Closeable.close() method to perform any required cleanup. - -Reducer has 3 primary phases: shuffle, sort and reduce. - -Shuffle -Input to the Reducer is the sorted output of the mappers. In this phase the framework fetches the relevant partition of the output of all the mappers, via HTTP. - -Sort -The framework groups Reducer inputs by keys (since different mappers may have output the same key) in this stage. - -The shuffle and sort phases occur simultaneously; while map-outputs are being fetched they are merged. - -Secondary Sort -If equivalence rules for grouping the intermediate keys are required to be different from those for grouping keys before reduction, then one may specify a Comparator via JobConf.setOutputValueGroupingComparator(Class). Since JobConf.setOutputKeyComparatorClass(Class) can be used to control how intermediate keys are grouped, these can be used in conjunction to simulate secondary sort on values. - -Reduce -In this phase the reduce(WritableComparable, Iterator, OutputCollector, Reporter) method is called for each <key, (list of values)> pair in the grouped inputs. - -The output of the reduce task is typically written to the FileSystem via OutputCollector.collect(WritableComparable, Writable). - -Applications can use the Reporter to report progress, set application-level status messages and update Counters, or just indicate that they are alive. - -The output of the Reducer is not sorted. - -How Many Reduces? -The right number of reduces seems to be 0.95 or 1.75 multiplied by (<no. of nodes> * mapred.tasktracker.reduce.tasks.maximum). - -With 0.95 all of the reduces can launch immediately and start transferring map outputs as the maps finish. With 1.75 the faster nodes will finish their first round of reduces and launch a second wave of reduces doing a much better job of load balancing. - -Increasing the number of reduces increases the framework overhead, but increases load balancing and lowers the cost of failures. - -The scaling factors above are slightly less than whole numbers to reserve a few reduce slots in the framework for speculative-tasks and failed tasks. - -Reducer NONE -It is legal to set the number of reduce-tasks to zero if no reduction is desired. - -In this case the outputs of the map-tasks go directly to the FileSystem, into the output path set by setOutputPath(Path). The framework does not sort the map-outputs before writing them out to the FileSystem. - -Partitioner -Partitioner partitions the key space. - -Partitioner controls the partitioning of the keys of the intermediate map-outputs.
The key (or a subset of the key) is used to derive the partition, typically by a hash function. The total number of partitions is the same as the number of reduce tasks for the job. Hence this controls which of the m reduce tasks the intermediate key (and hence the record) is sent to for reduction. - -HashPartitioner is the default Partitioner. - -Reporter -Reporter is a facility for MapReduce applications to report progress, set application-level status messages and update Counters. - -Mapper and Reducer implementations can use the Reporter to report progress or just indicate that they are alive. In scenarios where the application takes a significant amount of time to process individual key/value pairs, this is crucial since the framework might assume that the task has timed-out and kill that task. Another way to avoid this is to set the configuration parameter mapred.task.timeout to a high-enough value (or even set it to zero for no time-outs). - -Applications can also update Counters using the Reporter. - -OutputCollector -OutputCollector is a generalization of the facility provided by the MapReduce framework to collect data output by the Mapper or the Reducer (either the intermediate outputs or the output of the job). - -Hadoop MapReduce comes bundled with a library of generally useful mappers, reducers, and partitioners. - -Job Configuration -JobConf represents a MapReduce job configuration. - -JobConf is the primary interface for a user to describe a MapReduce job to the Hadoop framework for execution. The framework tries to faithfully execute the job as described by JobConf, however: - -f Some configuration parameters may have been marked as final by administrators and hence cannot be altered. -While some job parameters are straight-forward to set (e.g. setNumReduceTasks(int)), other parameters interact subtly with the rest of the framework and/or job configuration and are more complex to set (e.g. setNumMapTasks(int)). -JobConf is typically used to specify the Mapper, combiner (if any), Partitioner, Reducer, InputFormat, OutputFormat and OutputCommitter implementations. JobConf also indicates the set of input files (setInputPaths(JobConf, Path...) /addInputPath(JobConf, Path)) and (setInputPaths(JobConf, String) /addInputPaths(JobConf, String)) and where the output files should be written (setOutputPath(Path)). - -Optionally, JobConf is used to specify other advanced facets of the job such as the Comparator to be used, files to be put in the DistributedCache, whether intermediate and/or job outputs are to be compressed (and how), debugging via user-provided scripts (setMapDebugScript(String)/setReduceDebugScript(String)) , whether job tasks can be executed in a speculative manner (setMapSpeculativeExecution(boolean))/(setReduceSpeculativeExecution(boolean)) , maximum number of attempts per task (setMaxMapAttempts(int)/setMaxReduceAttempts(int)) , percentage of tasks failure which can be tolerated by the job (setMaxMapTaskFailuresPercent(int)/setMaxReduceTaskFailuresPercent(int)) etc. - -Of course, users can use set(String, String)/get(String, String) to set/get arbitrary parameters needed by applications. However, use the DistributedCache for large amounts of (read-only) data. - -Task Execution & Environment -The TaskTracker executes the Mapper/ Reducer task as a child process in a separate jvm. - -The child-task inherits the environment of the parent TaskTracker. 
The user can specify additional options to the child-jvm via the mapred.{map|reduce}.child.java.opts configuration parameter in the JobConf such as non-standard paths for the run-time linker to search shared libraries via -Djava.library.path=<> etc. If the mapred.{map|reduce}.child.java.opts parameter contains the symbol @taskid@ it is interpolated with the value of the taskid of the MapReduce task. - -Here is an example with multiple arguments and substitutions, showing jvm GC logging, and start of a passwordless JVM JMX agent so that it can connect with jconsole and the likes to watch child memory, threads and get thread dumps. It also sets the maximum heap-size of the map and reduce child jvm to 512MB & 1024MB respectively. It also adds an additional path to the java.library.path of the child-jvm. - -<property> - <name>mapred.map.child.java.opts</name> - <value> - -Xmx512M -Djava.library.path=/home/mycompany/lib -verbose:gc -Xloggc:/tmp/@taskid@.gc - -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false - </value> - </property> - -<property> - <name>mapred.reduce.child.java.opts</name> - <value> - -Xmx1024M -Djava.library.path=/home/mycompany/lib -verbose:gc -Xloggc:/tmp/@taskid@.gc - -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false - </value> - </property> - -Memory Management -Users/admins can also specify the maximum virtual memory of the launched child-task, and any sub-process it launches recursively, using mapred.{map|reduce}.child.ulimit. Note that the value set here is a per process limit. The value for mapred.{map|reduce}.child.ulimit should be specified in kilobytes (KB). Also, the value must be greater than or equal to the -Xmx passed to the JavaVM, else the VM might not start. - -Note: mapred.{map|reduce}.child.java.opts are used only for configuring the launched child tasks from task tracker. Configuring the memory options for daemons is documented in Configuring the Environment of the Hadoop Daemons. - -The memory available to some parts of the framework is also configurable. In map and reduce tasks, performance may be influenced by adjusting parameters influencing the concurrency of operations and the frequency with which data will hit disk. Monitoring the filesystem counters for a job- particularly relative to byte counts from the map and into the reduce- is invaluable to the tuning of these parameters. - -Users can choose to override default limits of Virtual Memory and RAM enforced by the task tracker, if memory management is enabled. Users can set the following parameter per job: - -Name Type Description -mapred.task.maxvmem int A number, in bytes, that represents the maximum Virtual Memory task-limit for each task of the job. A task will be killed if it consumes more Virtual Memory than this number. -mapred.task.maxpmem int A number, in bytes, that represents the maximum RAM task-limit for each task of the job. This number can be optionally used by Schedulers to prevent over-scheduling of tasks on a node based on RAM needs. -Map Parameters -A record emitted from a map will be serialized into a buffer and metadata will be stored into accounting buffers. As described in the following options, when either the serialization buffer or the metadata exceeds a threshold, the contents of the buffers will be sorted and written to disk in the background while the map continues to output records. If either buffer fills completely while the spill is in progress, the map thread will block. When the map is finished, any remaining records are written to disk and all on-disk segments are merged into a single file.
Minimizing the number of spills to disk can decrease map time, but a larger buffer also decreases the memory available to the mapper. - -Name Type Description -io.sort.mb int The cumulative size of the serialization and accounting buffers storing records emitted from the map, in megabytes. -io.sort.record.percent float The ratio of serialization to accounting space can be adjusted. Each serialized record requires 16 bytes of accounting information in addition to its serialized size to effect the sort. This percentage of space allocated from io.sort.mb affects the probability of a spill to disk being caused by either exhaustion of the serialization buffer or the accounting space. Clearly, for a map outputting small records, a higher value than the default will likely decrease the number of spills to disk. -io.sort.spill.percent float This is the threshold for the accounting and serialization buffers. When this percentage of either buffer has filled, their contents will be spilled to disk in the background. Let io.sort.record.percent be r, io.sort.mb be x, and this value be q. The maximum number of records collected before the collection thread will spill is r * x * q * 2^16. Note that a higher value may decrease the number of- or even eliminate- merges, but will also increase the probability of the map task getting blocked. The lowest average map times are usually obtained by accurately estimating the size of the map output and preventing multiple spills. -Other notes - -If either spill threshold is exceeded while a spill is in progress, collection will continue until the spill is finished. For example, if io.sort.buffer.spill.percent is set to 0.33, and the remainder of the buffer is filled while the spill runs, the next spill will include all the collected records, or 0.66 of the buffer, and will not generate additional spills. In other words, the thresholds are defining triggers, not blocking. -A record larger than the serialization buffer will first trigger a spill, then be spilled to a separate file. It is undefined whether or not this record will first pass through the combiner. -Shuffle/Reduce Parameters -As described previously, each reduce fetches the output assigned to it by the Partitioner via HTTP into memory and periodically merges these outputs to disk. If intermediate compression of map outputs is turned on, each output is decompressed into memory. The following options affect the frequency of these merges to disk prior to the reduce and the memory allocated to map output during the reduce. - -Name Type Description -io.sort.factor int Specifies the number of segments on disk to be merged at the same time. It limits the number of open files and compression codecs during the merge. If the number of files exceeds this limit, the merge will proceed in several passes. Though this limit also applies to the map, most jobs should be configured so that hitting this limit is unlikely there. -mapred.inmem.merge.threshold int The number of sorted map outputs fetched into memory before being merged to disk. Like the spill thresholds in the preceding note, this is not defining a unit of partition, but a trigger. In practice, this is usually set very high (1000) or disabled (0), since merging in-memory segments is often less expensive than merging from disk (see notes following this table). This threshold influences only the frequency of in-memory merges during the shuffle. 
-mapred.job.shuffle.merge.percent float The memory threshold for fetched map outputs before an in-memory merge is started, expressed as a percentage of memory allocated to storing map outputs in memory. Since map outputs that can't fit in memory can be stalled, setting this high may decrease parallelism between the fetch and merge. Conversely, values as high as 1.0 have been effective for reduces whose input can fit entirely in memory. This parameter influences only the frequency of in-memory merges during the shuffle. -mapred.job.shuffle.input.buffer.percent float The percentage of memory- relative to the maximum heapsize as typically specified in mapred.reduce.child.java.opts- that can be allocated to storing map outputs during the shuffle. Though some memory should be set aside for the framework, in general it is advantageous to set this high enough to store large and numerous map outputs. -mapred.job.reduce.input.buffer.percent float The percentage of memory relative to the maximum heapsize in which map outputs may be retained during the reduce. When the reduce begins, map outputs will be merged to disk until those that remain are under the resource limit this defines. By default, all map outputs are merged to disk before the reduce begins to maximize the memory available to the reduce. For less memory-intensive reduces, this should be increased to avoid trips to disk. -Other notes - -If a map output is larger than 25 percent of the memory allocated to copying map outputs, it will be written directly to disk without first staging through memory. -When running with a combiner, the reasoning about high merge thresholds and large buffers may not hold. For merges started before all map outputs have been fetched, the combiner is run while spilling to disk. In some cases, one can obtain better reduce times by spending resources combining map outputs- making disk spills small and parallelizing spilling and fetching- rather than aggressively increasing buffer sizes. -When merging in-memory map outputs to disk to begin the reduce, if an intermediate merge is necessary because there are segments to spill and at least io.sort.factor segments already on disk, the in-memory map outputs will be part of the intermediate merge. -Directory Structure -The task tracker has local directory, ${mapred.local.dir}/taskTracker/ to create localized cache and localized job. It can define multiple local directories (spanning multiple disks) and then each filename is assigned to a semi-random local directory. When the job starts, task tracker creates a localized job directory relative to the local directory specified in the configuration. Thus the task tracker directory structure looks as following: - -${mapred.local.dir}/taskTracker/distcache/ : The public distributed cache for the jobs of all users. This directory holds the localized public distributed cache. Thus localized public distributed cache is shared among all the tasks and jobs of all users. -${mapred.local.dir}/taskTracker/$user/distcache/ : The private distributed cache for the jobs of the specific user. This directory holds the localized private distributed cache. Thus localized private distributed cache is shared among all the tasks and jobs of the specific user only. It is not accessible to jobs of other users. -${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/ : The localized job directory -${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/work/ : The job-specific shared directory. 
The tasks can use this space as scratch space and share files among them. This directory is exposed to the users through the configuration property job.local.dir. The directory can accessed through the API JobConf.getJobLocalDir(). It is available as System property also. So, users (streaming etc.) can call System.getProperty("job.local.dir") to access the directory. -${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/jars/ : The jars directory, which has the job jar file and expanded jar. The job.jar is the application's jar file that is automatically distributed to each machine. It is expanded in jars directory before the tasks for the job start. The job.jar location is accessible to the application through the api JobConf.getJar() . To access the unjarred directory, JobConf.getJar().getParent() can be called. -${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/job.xml : The job.xml file, the generic job configuration, localized for the job. -${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid : The task directory for each task attempt. Each task directory again has the following structure : -${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid/job.xml : A job.xml file, task localized job configuration, Task localization means that properties have been set that are specific to this particular task within the job. The properties localized for each task are described below. -${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid/output : A directory for intermediate output files. This contains the temporary map reduce data generated by the framework such as map output files etc. -${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid/work : The current working directory of the task. With jvm reuse enabled for tasks, this directory will be the directory on which the jvm has started -${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid/work/tmp : The temporary directory for the task. (User can specify the property mapred.child.tmp to set the value of temporary directory for map and reduce tasks. This defaults to ./tmp. If the value is not an absolute path, it is prepended with task's working directory. Otherwise, it is directly assigned. The directory will be created if it doesn't exist. Then, the child java tasks are executed with option -Djava.io.tmpdir='the absolute path of the tmp dir'. Pipes and streaming are set with environment variable, TMPDIR='the absolute path of the tmp dir'). This directory is created, if mapred.child.tmp has the value ./tmp -Task JVM Reuse -Jobs can enable task JVMs to be reused by specifying the job configuration mapred.job.reuse.jvm.num.tasks. If the value is 1 (the default), then JVMs are not reused (i.e. 1 task per JVM). If it is -1, there is no limit to the number of tasks a JVM can run (of the same job). 
One can also specify some value greater than 1 using the api JobConf.setNumTasksToExecutePerJvm(int) - -Configured Parameters -The following properties are localized in the job configuration for each task's execution: - -Name Type Description -mapred.job.id String The job id -mapred.jar String job.jar location in job directory -job.local.dir String The job specific shared scratch space -mapred.tip.id String The task id -mapred.task.id String The task attempt id -mapred.task.is.map boolean Is this a map task -mapred.task.partition int The id of the task within the job -map.input.file String The filename that the map is reading from -map.input.start long The offset of the start of the map input split -map.input.length long The number of bytes in the map input split -mapred.work.output.dir String The task's temporary output directory -Note: During the execution of a streaming job, the names of the "mapred" parameters are transformed. The dots ( . ) become underscores ( _ ). For example, mapred.job.id becomes mapred_job_id and mapred.jar becomes mapred_jar. To get the values in a streaming job's mapper/reducer use the parameter names with the underscores. - -Task Logs -The standard output (stdout) and error (stderr) streams of the task are read by the TaskTracker and logged to ${HADOOP_LOG_DIR}/userlogs - -Distributing Libraries -The DistributedCache can also be used to distribute both jars and native libraries for use in the map and/or reduce tasks. The child-jvm always has its current working directory added to the java.library.path and LD_LIBRARY_PATH. And hence the cached libraries can be loaded via System.loadLibrary or System.load. More details on how to load shared libraries through distributed cache are documented at native_libraries.html - -Job Submission and Monitoring -JobClient is the primary interface by which user-job interacts with the JobTracker. - -JobClient provides facilities to submit jobs, track their progress, access component-tasks' reports and logs, get the MapReduce cluster's status information and so on. - -The job submission process involves: - -Checking the input and output specifications of the job. -Computing the InputSplit values for the job. -Setting up the requisite accounting information for the DistributedCache of the job, if necessary. -Copying the job's jar and configuration to the MapReduce system directory on the FileSystem. -Submitting the job to the JobTracker and optionally monitoring it's status. -Job history files are also logged to user specified directory hadoop.job.history.user.location which defaults to job output directory. The files are stored in "_logs/history/" in the specified directory. Hence, by default they will be in mapred.output.dir/_logs/history. User can stop logging by giving the value none for hadoop.job.history.user.location - -User can view the history logs summary in specified directory using the following command -$ bin/hadoop job -history output-dir -This command will print job details, failed and killed tip details. -More details about the job such as successful tasks and task attempts made for each task can be viewed using the following command -$ bin/hadoop job -history all output-dir -User can use OutputLogFilter to filter log files from the output directory listing. - -Normally the user creates the application, describes various facets of the job via JobConf, and then uses the JobClient to submit the job and monitor its progress. 
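To make the submit-and-monitor flow just described concrete, here is a minimal, assumed driver sketch built around JobClient.submitJob and RunningJob; the class name SubmitAndPoll is illustrative, and the mapper/reducer settings are omitted for brevity.

package org.myorg;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

// Illustrative driver: submits a job without blocking and polls its progress.
public class SubmitAndPoll {
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(SubmitAndPoll.class);
    conf.setJobName("submit-and-poll");
    // Mapper, reducer, key/value classes etc. would be set here as in the WordCount example.
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient client = new JobClient(conf);
    RunningJob running = client.submitJob(conf);   // returns immediately, unlike JobClient.runJob
    while (!running.isComplete()) {
      System.out.println("map " + running.mapProgress() + ", reduce " + running.reduceProgress());
      Thread.sleep(5000);
    }
    System.out.println(running.isSuccessful() ? "Job succeeded" : "Job failed");
  }
}

The blocking JobClient.runJob(conf) call used by WordCount is the simpler alternative when no scheduling decisions need to be made while the job runs.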
- -Job Authorization -Job level authorization and queue level authorization are enabled on the cluster, if the configuration mapred.acls.enabled is set to true. When enabled, access control checks are done by (a) the JobTracker before allowing users to submit jobs to queues and administering these jobs and (b) by the JobTracker and the TaskTracker before allowing users to view job details or to modify a job using MapReduce APIs, CLI or web user interfaces. - -A job submitter can specify access control lists for viewing or modifying a job via the configuration properties mapreduce.job.acl-view-job and mapreduce.job.acl-modify-job respectively. By default, nobody is given access in these properties. - -However, irrespective of the job ACLs configured, a job's owner, the superuser and cluster administrators (mapreduce.cluster.administrators) and queue administrators of the queue to which the job was submitted to (mapred.queue.queue-name.acl-administer-jobs) always have access to view and modify a job. - -A job view ACL authorizes users against the configured mapreduce.job.acl-view-job before returning possibly sensitive information about a job, like: - -job level counters -task level counters -tasks's diagnostic information -task logs displayed on the TaskTracker web UI -job.xml showed by the JobTracker's web UI -Other information about a job, like its status and its profile, is accessible to all users, without requiring authorization. - -A job modification ACL authorizes users against the configured mapreduce.job.acl-modify-job before allowing modifications to jobs, like: - -killing a job -killing/failing a task of a job -setting the priority of a job -These operations are also permitted by the queue level ACL, "mapred.queue.queue-name.acl-administer-jobs", configured via mapred-queue-acls.xml. The caller will be able to do the operation if he/she is part of either queue admins ACL or job modification ACL. - -The format of a job level ACL is the same as the format for a queue level ACL as defined in the Cluster Setup documentation. - -Job Control -Users may need to chain MapReduce jobs to accomplish complex tasks which cannot be done via a single MapReduce job. This is fairly easy since the output of the job typically goes to distributed file-system, and the output, in turn, can be used as the input for the next job. - -However, this also means that the onus on ensuring jobs are complete (success/failure) lies squarely on the clients. In such cases, the various job-control options are: - -runJob(JobConf) : Submits the job and returns only after the job has completed. -submitJob(JobConf) : Only submits the job, then poll the returned handle to the RunningJob to query status and make scheduling decisions. -JobConf.setJobEndNotificationURI(String) : Sets up a notification upon job-completion, thus avoiding polling. -Job Credentials -In a secure cluster, the user is authenticated via Kerberos' kinit command. Because of scalability concerns, we don't push the client's Kerberos' tickets in MapReduce jobs. Instead, we acquire delegation tokens from each HDFS NameNode that the job will use and store them in the job as part of job submission. The delegation tokens are automatically obtained for the HDFS that holds the staging directories, where the job job files are written, and any HDFS systems referenced by FileInputFormats, FileOutputFormats, DistCp, and the distributed cache. 
Other applications need to set the configuration "mapreduce.job.hdfs-servers" for all NameNodes that tasks might need to talk to during job execution. This is a comma-separated list of file system names, such as "hdfs://nn1/,hdfs://nn2/". These tokens are passed to the JobTracker as part of the job submission as Credentials. - -Similar to HDFS delegation tokens, we also have MapReduce delegation tokens. The MapReduce tokens are provided so that tasks can spawn jobs if they wish to. The tasks authenticate to the JobTracker via the MapReduce delegation tokens. The delegation token can be obtained via the API JobClient.getDelegationToken. The obtained token must then be pushed onto the credentials that are in the JobConf used for job submission. The API Credentials.addToken can be used for this. - -The credentials are sent to the JobTracker as part of the job submission process. The JobTracker persists the tokens and secrets in its filesystem (typically HDFS) in a file within mapred.system.dir/JOBID. The TaskTracker localizes the file as part of job localization. Tasks see an environment variable called HADOOP_TOKEN_FILE_LOCATION and the framework sets this to point to the localized file. In order to launch jobs from tasks or to do any HDFS operation, tasks must set the configuration "mapreduce.job.credentials.binary" to point to this token file. - -The HDFS delegation tokens passed to the JobTracker during job submission are cancelled by the JobTracker when the job completes. This is the default behavior unless mapreduce.job.complete.cancel.delegation.tokens is set to false in the JobConf. For jobs whose tasks in turn spawn jobs, this should be set to false. Applications sharing JobConf objects between multiple jobs on the JobClient side should also consider setting mapreduce.job.complete.cancel.delegation.tokens to false. This is because the Credentials object within the JobConf will then be shared. All jobs will end up sharing the same tokens, and hence the tokens should not be cancelled when the jobs in the sequence finish. - -Apart from the HDFS delegation tokens, arbitrary secrets can also be passed during the job submission for tasks to access other third-party services. The APIs JobConf.getCredentials() or JobContext.getCredentials() should be used to get the credentials object, and then Credentials.addSecretKey should be used to add secrets. - -For applications written using the old MapReduce API, the Mapper/Reducer classes need to implement JobConfigurable in order to get access to the credentials in the tasks. A reference to the JobConf passed to JobConfigurable.configure should be stored. In the new MapReduce API, a similar thing can be done in the Mapper.setup method. The API JobConf.getCredentials() or JobContext.getCredentials() should be used to get the credentials reference (depending on whether the old or the new MapReduce API is used). Tasks can access the secrets using the APIs in Credentials. - -Job Input -InputFormat describes the input-specification for a MapReduce job. - -The MapReduce framework relies on the InputFormat of the job to: - -Validate the input-specification of the job. -Split up the input file(s) into logical InputSplit instances, each of which is then assigned to an individual Mapper. -Provide the RecordReader implementation used to glean input records from the logical InputSplit for processing by the Mapper.
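A minimal sketch of the secret-passing APIs mentioned under Job Credentials above; the alias, the secret value and the class names are made up for illustration, and the Credentials method signatures should be verified against the Hadoop release in use.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class SecretDemo {
  private static final Text ALIAS = new Text("my.service.apikey"); // made-up alias

  // At submission time: attach an application secret to the job's credentials.
  public static void attachSecret(JobConf conf) throws IOException {
    conf.getCredentials().addSecretKey(ALIAS, "s3cr3t".getBytes("UTF-8"));
  }

  // In an old-API task, the secret can be read back from the JobConf in configure().
  public static class SecretAwareMapper extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, Text> {
    private byte[] secret;

    public void configure(JobConf job) {
      secret = job.getCredentials().getSecretKey(ALIAS);
      // use the secret to authenticate to the third-party service
    }

    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output,
        Reporter reporter) throws IOException {
      output.collect(value, value); // placeholder logic
    }
  }
}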
-The default behavior of file-based InputFormat implementations, typically sub-classes of FileInputFormat, is to split the input into logical InputSplit instances based on the total size, in bytes, of the input files. However, the FileSystem blocksize of the input files is treated as an upper bound for input splits. A lower bound on the split size can be set via mapred.min.split.size. - -Clearly, logical splits based on input size are insufficient for many applications since record boundaries must be respected. In such cases, the application should implement a RecordReader, which is responsible for respecting record boundaries and presents a record-oriented view of the logical InputSplit to the individual task. - -TextInputFormat is the default InputFormat. - -If TextInputFormat is the InputFormat for a given job, the framework detects input files with the .gz extension and automatically decompresses them using the appropriate CompressionCodec. However, it must be noted that compressed files with the above extension cannot be split and each compressed file is processed in its entirety by a single mapper. - -InputSplit -InputSplit represents the data to be processed by an individual Mapper. - -Typically InputSplit presents a byte-oriented view of the input, and it is the responsibility of RecordReader to process and present a record-oriented view. - -FileSplit is the default InputSplit. It sets map.input.file to the path of the input file for the logical split. - -RecordReader -RecordReader reads <key, value> pairs from an InputSplit. - -Typically the RecordReader converts the byte-oriented view of the input, provided by the InputSplit, and presents a record-oriented view to the Mapper implementations for processing. RecordReader thus assumes the responsibility of processing record boundaries and presents the tasks with keys and values. - -Job Output -OutputFormat describes the output-specification for a MapReduce job. - -The MapReduce framework relies on the OutputFormat of the job to: - -Validate the output-specification of the job; for example, check that the output directory doesn't already exist. -Provide the RecordWriter implementation used to write the output files of the job. Output files are stored in a FileSystem. -TextOutputFormat is the default OutputFormat. - -OutputCommitter -OutputCommitter describes the commit of task output for a MapReduce job. - -The MapReduce framework relies on the OutputCommitter of the job to: - -Setup the job during initialization. For example, create the temporary output directory for the job during the initialization of the job. Job setup is done by a separate task when the job is in the PREP state and after initializing tasks. Once the setup task completes, the job will be moved to the RUNNING state. -Cleanup the job after the job completion. For example, remove the temporary output directory after the job completion. Job cleanup is done by a separate task at the end of the job. The job is declared SUCCEEDED/FAILED/KILLED after the cleanup task completes. -Setup the task's temporary output. Task setup is done as part of the same task, during task initialization. -Check whether a task needs a commit. This is to avoid the commit procedure if a task does not need one. -Commit the task output. Once the task is done, it will commit its output if required. -Discard the task commit. If the task has failed or been killed, the output will be cleaned up. If the task could not clean up (for example, in an exception block), a separate task will be launched with the same attempt-id to do the cleanup.
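These responsibilities map roughly onto the methods of the old org.apache.hadoop.mapred.OutputCommitter class. The skeleton below is only a sketch with a hypothetical class name; the exact method set (for example cleanupJob versus commitJob) differs between Hadoop releases, so check the API of the version in use.

import java.io.IOException;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.OutputCommitter;
import org.apache.hadoop.mapred.TaskAttemptContext;

public class AuditingOutputCommitter extends OutputCommitter {
  public void setupJob(JobContext context) throws IOException {
    // e.g. create the job's temporary output directory during initialization
  }
  public void cleanupJob(JobContext context) throws IOException {
    // e.g. remove the temporary output directory after the job completes
  }
  public void setupTask(TaskAttemptContext context) throws IOException {
    // per task-attempt setup, run as part of the task itself
  }
  public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
    return true; // return false to skip the commit procedure for tasks with nothing to commit
  }
  public void commitTask(TaskAttemptContext context) throws IOException {
    // promote the task-attempt's output once the task finishes successfully
  }
  public void abortTask(TaskAttemptContext context) throws IOException {
    // discard the output of a failed/killed task-attempt
  }
}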
-FileOutputCommitter is the default OutputCommitter. Job setup/cleanup tasks occupy map or reduce slots, whichever is free on the TaskTracker. The JobCleanup task, TaskCleanup tasks and JobSetup task have the highest priority, in that order. - -Task Side-Effect Files -In some applications, component tasks need to create and/or write to side-files, which differ from the actual job-output files. - -In such cases there could be issues with two instances of the same Mapper or Reducer running simultaneously (for example, speculative tasks) trying to open and/or write to the same file (path) on the FileSystem. Hence the application-writer will have to pick unique names per task-attempt (using the attemptid, say attempt_200709221812_0001_m_000000_0), not just per task. - -To avoid these issues the MapReduce framework, when the OutputCommitter is FileOutputCommitter, maintains a special ${mapred.output.dir}/_temporary/_${taskid} sub-directory accessible via ${mapred.work.output.dir} for each task-attempt on the FileSystem where the output of the task-attempt is stored. On successful completion of the task-attempt, the files in the ${mapred.output.dir}/_temporary/_${taskid} (only) are promoted to ${mapred.output.dir}. Of course, the framework discards the sub-directory of unsuccessful task-attempts. This process is completely transparent to the application. - -The application-writer can take advantage of this feature by creating any side-files required in ${mapred.work.output.dir} during execution of a task via FileOutputFormat.getWorkOutputPath(), and the framework will promote them similarly for successful task-attempts, thus eliminating the need to pick unique paths per task-attempt. - -Note: The value of ${mapred.work.output.dir} during execution of a particular task-attempt is actually ${mapred.output.dir}/_temporary/_${taskid}, and this value is set by the MapReduce framework. So, just create any side-files in the path returned by FileOutputFormat.getWorkOutputPath() from the MapReduce task to take advantage of this feature. - -The entire discussion holds true for maps of jobs with reducer=NONE (i.e. 0 reduces) since the output of the map, in that case, goes directly to HDFS. - -RecordWriter -RecordWriter writes the output <key, value> pairs to an output file. - -RecordWriter implementations write the job outputs to the FileSystem. - -Other Useful Features -Submitting Jobs to Queues -Users submit jobs to Queues. Queues, as collections of jobs, allow the system to provide specific functionality. For example, queues use ACLs to control which users can submit jobs to them. Queues are expected to be primarily used by Hadoop Schedulers. - -Hadoop comes configured with a single mandatory queue, called 'default'. Queue names are defined in the mapred.queue.names property of the Hadoop site configuration. Some job schedulers, such as the Capacity Scheduler, support multiple queues. - -A job defines the queue it needs to be submitted to through the mapred.job.queue.name property, or through the setQueueName(String) API. Setting the queue name is optional. If a job is submitted without an associated queue name, it is submitted to the 'default' queue. - -Counters -Counters represent global counters, defined either by the MapReduce framework or applications. Each Counter can be of any Enum type. Counters of a particular Enum are bunched into groups of type Counters.Group.
- -Applications can define arbitrary Counters (of type Enum) and update them via Reporter.incrCounter(Enum, long) or Reporter.incrCounter(String, String, long) in the map and/or reduce methods. These counters are then globally aggregated by the framework. - -DistributedCache -DistributedCache distributes application-specific, large, read-only files efficiently. - -DistributedCache is a facility provided by the MapReduce framework to cache files (text, archives, jars and so on) needed by applications. - -Applications specify the files to be cached via urls (hdfs://) in the JobConf. The DistributedCache assumes that the files specified via hdfs:// urls are already present on the FileSystem. - -The framework will copy the necessary files to the slave node before any tasks for the job are executed on that node. Its efficiency stems from the fact that the files are only copied once per job and the ability to cache archives which are un-archived on the slaves. - -DistributedCache tracks the modification timestamps of the cached files. Clearly the cache files should not be modified by the application or externally while the job is executing. - -DistributedCache can be used to distribute simple, read-only data/text files and more complex types such as archives and jars. Archives (zip, tar, tgz and tar.gz files) are un-archived at the slave nodes. Files have execution permissions set. - -The files/archives can be distributed by setting the property mapred.cache.{files|archives}. If more than one file/archive has to be distributed, they can be added as comma separated paths. The properties can also be set by APIs DistributedCache.addCacheFile(URI,conf)/ DistributedCache.addCacheArchive(URI,conf) and DistributedCache.setCacheFiles(URIs,conf)/ DistributedCache.setCacheArchives(URIs,conf) where URI is of the form hdfs://host:port/absolute-path#link-name. In Streaming, the files can be distributed through command line option -cacheFile/-cacheArchive. - -Optionally users can also direct the DistributedCache to symlink the cached file(s) into the current working directory of the task via the DistributedCache.createSymlink(Configuration) api. Or by setting the configuration property mapred.create.symlink as yes. The DistributedCache will use the fragment of the URI as the name of the symlink. For example, the URI hdfs://namenode:port/lib.so.1#lib.so will have the symlink name as lib.so in task's cwd for the file lib.so.1 in distributed cache. - -The DistributedCache can also be used as a rudimentary software distribution mechanism for use in the map and/or reduce tasks. It can be used to distribute both jars and native libraries. The DistributedCache.addArchiveToClassPath(Path, Configuration) or DistributedCache.addFileToClassPath(Path, Configuration) api can be used to cache files/jars and also add them to the classpath of child-jvm. The same can be done by setting the configuration properties mapred.job.classpath.{files|archives}. Similarly the cached files that are symlinked into the working directory of the task can be used to distribute native libraries and load them. - -Private and Public DistributedCache Files -DistributedCache files can be private or public, that determines how they can be shared on the slave nodes. - -"Private" DistributedCache files are cached in a local directory private to the user whose jobs need these files. These files are shared by all tasks and jobs of the specific user only and cannot be accessed by jobs of other users on the slaves. 
A DistributedCache file becomes private by virtue of its permissions on the file system where the files are uploaded, typically HDFS. If the file has no world readable access, or if the directory path leading to the file has no world executable access for lookup, then the file becomes private. -"Public" DistributedCache files are cached in a global directory and the file access is setup such that they are publicly visible to all users. These files can be shared by tasks and jobs of all users on the slaves. A DistributedCache file becomes public by virtue of its permissions on the file system where the files are uploaded, typically HDFS. If the file has world readable access, AND if the directory path leading to the file has world executable access for lookup, then the file becomes public. In other words, if the user intends to make a file publicly available to all users, the file permissions must be set to be world readable, and the directory permissions on the path leading to the file must be world executable. -Tool -The Tool interface supports the handling of generic Hadoop command-line options. - -Tool is the standard for any MapReduce tool or application. The application should delegate the handling of standard command-line options to GenericOptionsParser via ToolRunner.run(Tool, String[]) and only handle its custom arguments. - -The generic Hadoop command-line options are: --conf --D --fs --jt - -IsolationRunner -IsolationRunner is a utility to help debug MapReduce programs. - -To use the IsolationRunner, first set keep.failed.task.files to true (also see keep.task.files.pattern). - -Next, go to the node on which the failed task ran and go to the TaskTracker's local directory and run the IsolationRunner: -$ cd /taskTracker/${taskid}/work -$ bin/hadoop org.apache.hadoop.mapred.IsolationRunner ../job.xml - -IsolationRunner will run the failed task in a single jvm, which can be in the debugger, over precisely the same input. - -Note that currently IsolationRunner will only re-run map tasks. - -Profiling -Profiling is a utility to get a representative (2 or 3) sample of built-in java profiler for a sample of maps and reduces. - -User can specify whether the system should collect profiler information for some of the tasks in the job by setting the configuration property mapred.task.profile. The value can be set using the api JobConf.setProfileEnabled(boolean). If the value is set true, the task profiling is enabled. The profiler information is stored in the user log directory. By default, profiling is not enabled for the job. - -Once user configures that profiling is needed, she/he can use the configuration property mapred.task.profile.{maps|reduces} to set the ranges of MapReduce tasks to profile. The value can be set using the api JobConf.setProfileTaskRange(boolean,String). By default, the specified range is 0-2. - -User can also specify the profiler configuration arguments by setting the configuration property mapred.task.profile.params. The value can be specified using the api JobConf.setProfileParams(String). If the string contains a %s, it will be replaced with the name of the profiling output file when the task runs. These parameters are passed to the task child JVM on the command line. The default value for the profiling parameters is -agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s - -Debugging -The MapReduce framework provides a facility to run user-provided scripts for debugging. 
When a MapReduce task fails, a user can run a debug script, to process task logs for example. The script is given access to the task's stdout and stderr outputs, syslog and jobconf. The output from the debug script's stdout and stderr is displayed on the console diagnostics and also as part of the job UI. - -In the following sections we discuss how to submit a debug script with a job. The script file needs to be distributed and submitted to the framework. - -How to distribute the script file: -The user needs to use DistributedCache to distribute and symlink the script file. - -How to submit the script: -A quick way to submit the debug script is to set values for the properties mapred.map.task.debug.script and mapred.reduce.task.debug.script, for debugging map and reduce tasks respectively. These properties can also be set by using APIs JobConf.setMapDebugScript(String) and JobConf.setReduceDebugScript(String) . In streaming mode, a debug script can be submitted with the command-line options -mapdebug and -reducedebug, for debugging map and reduce tasks respectively. - -The arguments to the script are the task's stdout, stderr, syslog and jobconf files. The debug command, run on the node where the MapReduce task failed, is: -$script $stdout $stderr $syslog $jobconf - -Pipes programs have the c++ program name as a fifth argument for the command. Thus for the pipes programs the command is -$script $stdout $stderr $syslog $jobconf $program - -Default Behavior: -For pipes, a default script is run to process core dumps under gdb, prints stack trace and gives info about running threads. - -JobControl -JobControl is a utility which encapsulates a set of MapReduce jobs and their dependencies. - -Data Compression -Hadoop MapReduce provides facilities for the application-writer to specify compression for both intermediate map-outputs and the job-outputs i.e. output of the reduces. It also comes bundled with CompressionCodec implementation for the zlib compression algorithm. The gzip file format is also supported. - -Hadoop also provides native implementations of the above compression codecs for reasons of both performance (zlib) and non-availability of Java libraries. More details on their usage and availability are available here. - -Intermediate Outputs -Applications can control compression of intermediate map-outputs via the JobConf.setCompressMapOutput(boolean) api and the CompressionCodec to be used via the JobConf.setMapOutputCompressorClass(Class) api. - -Job Outputs -Applications can control compression of job-outputs via the FileOutputFormat.setCompressOutput(JobConf, boolean) api and the CompressionCodec to be used can be specified via the FileOutputFormat.setOutputCompressorClass(JobConf, Class) api. - -If the job outputs are to be stored in the SequenceFileOutputFormat, the required SequenceFile.CompressionType (i.e. RECORD / BLOCK - defaults to RECORD) can be specified via the SequenceFileOutputFormat.setOutputCompressionType(JobConf, SequenceFile.CompressionType) api. - -Skipping Bad Records -Hadoop provides an option where a certain set of bad input records can be skipped when processing map inputs. Applications can control this feature through the SkipBadRecords class. - -This feature can be used when map tasks crash deterministically on certain input. This usually happens due to bugs in the map function. Usually, the user would have to fix these bugs. This is, however, not possible sometimes. 
The bug may be in third party libraries, for example, for which the source code is not available. In such cases, the task never completes successfully even after multiple attempts, and the job fails. With this feature, only a small portion of data surrounding the bad records is lost, which may be acceptable for some applications (those performing statistical analysis on very large data, for example). - -By default this feature is disabled. To enable it, refer to SkipBadRecords.setMapperMaxSkipRecords(Configuration, long) and SkipBadRecords.setReducerMaxSkipGroups(Configuration, long). - -With this feature enabled, the framework gets into 'skipping mode' after a certain number of map failures. For more details, see SkipBadRecords.setAttemptsToStartSkipping(Configuration, int). In 'skipping mode', map tasks maintain the range of records being processed. To do this, the framework relies on the processed record counter. See SkipBadRecords.COUNTER_MAP_PROCESSED_RECORDS and SkipBadRecords.COUNTER_REDUCE_PROCESSED_GROUPS. This counter enables the framework to know how many records have been processed successfully, and hence, what record range caused a task to crash. On further attempts, this range of records is skipped. - -The number of records skipped depends on how frequently the processed record counter is incremented by the application. It is recommended that this counter be incremented after every record is processed. This may not be possible in some applications that typically batch their processing. In such cases, the framework may skip additional records surrounding the bad record. Users can control the number of skipped records through SkipBadRecords.setMapperMaxSkipRecords(Configuration, long) and SkipBadRecords.setReducerMaxSkipGroups(Configuration, long). The framework tries to narrow the range of skipped records using a binary search-like approach. The skipped range is divided into two halves and only one half gets executed. On subsequent failures, the framework figures out which half contains bad records. A task will be re-executed until the acceptable skipped value is met or all task attempts are exhausted. To increase the number of task attempts, use JobConf.setMaxMapAttempts(int) and JobConf.setMaxReduceAttempts(int). - -Skipped records are written to HDFS in the sequence file format, for later analysis. The location can be changed through SkipBadRecords.setSkipOutputPath(JobConf, Path). - -Example: WordCount v2.0 -Here is a more complete WordCount which uses many of the features provided by the MapReduce framework we have discussed so far. - -This needs the HDFS to be up and running, especially for the DistributedCache-related features. Hence it only works with a pseudo-distributed or fully-distributed Hadoop installation. - -Source Code -WordCount.java -1. package org.myorg; -2. -3. import java.io.*; -4. import java.util.*; -5. -6. import org.apache.hadoop.fs.Path; -7. import org.apache.hadoop.filecache.DistributedCache; -8. import org.apache.hadoop.conf.*; -9. import org.apache.hadoop.io.*; -10. import org.apache.hadoop.mapred.*; -11. import org.apache.hadoop.util.*; -12. -13. public class WordCount extends Configured implements Tool { -14. -15. public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { -16. -17. static enum Counters { INPUT_WORDS } -18. -19. private final static IntWritable one = new IntWritable(1); -20. private Text word = new Text(); -21. -22. private boolean caseSensitive = true; -23. private Set<String> patternsToSkip = new HashSet<String>(); -24. -25. 
private long numRecords = 0; -26. private String inputFile; -27. -28. public void configure(JobConf job) { -29. caseSensitive = job.getBoolean("wordcount.case.sensitive", true); -30. inputFile = job.get("map.input.file"); -31. -32. if (job.getBoolean("wordcount.skip.patterns", false)) { -33. Path[] patternsFiles = new Path[0]; -34. try { -35. patternsFiles = DistributedCache.getLocalCacheFiles(job); -36. } catch (IOException ioe) { -37. System.err.println("Caught exception while getting cached files: " + StringUtils.stringifyException(ioe)); -38. } -39. for (Path patternsFile : patternsFiles) { -40. parseSkipFile(patternsFile); -41. } -42. } -43. } -44. -45. private void parseSkipFile(Path patternsFile) { -46. try { -47. BufferedReader fis = new BufferedReader(new FileReader(patternsFile.toString())); -48. String pattern = null; -49. while ((pattern = fis.readLine()) != null) { -50. patternsToSkip.add(pattern); -51. } -52. } catch (IOException ioe) { -53. System.err.println("Caught exception while parsing the cached file '" + patternsFile + "' : " + StringUtils.stringifyException(ioe)); -54. } -55. } -56. -57. public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { -58. String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase(); -59. -60. for (String pattern : patternsToSkip) { -61. line = line.replaceAll(pattern, ""); -62. } -63. -64. StringTokenizer tokenizer = new StringTokenizer(line); -65. while (tokenizer.hasMoreTokens()) { -66. word.set(tokenizer.nextToken()); -67. output.collect(word, one); -68. reporter.incrCounter(Counters.INPUT_WORDS, 1); -69. } -70. -71. if ((++numRecords % 100) == 0) { -72. reporter.setStatus("Finished processing " + numRecords + " records " + "from the input file: " + inputFile); -73. } -74. } -75. } -76. -77. public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { -78. public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { -79. int sum = 0; -80. while (values.hasNext()) { -81. sum += values.next().get(); -82. } -83. output.collect(key, new IntWritable(sum)); -84. } -85. } -86. -87. public int run(String[] args) throws Exception { -88. JobConf conf = new JobConf(getConf(), WordCount.class); -89. conf.setJobName("wordcount"); -90. -91. conf.setOutputKeyClass(Text.class); -92. conf.setOutputValueClass(IntWritable.class); -93. -94. conf.setMapperClass(Map.class); -95. conf.setCombinerClass(Reduce.class); -96. conf.setReducerClass(Reduce.class); -97. -98. conf.setInputFormat(TextInputFormat.class); -99. conf.setOutputFormat(TextOutputFormat.class); -100. -101. List<String> other_args = new ArrayList<String>(); -102. for (int i=0; i < args.length; ++i) { -103. if ("-skip".equals(args[i])) { -104. DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf); -105. conf.setBoolean("wordcount.skip.patterns", true); -106. } else { -107. other_args.add(args[i]); -108. } -109. } -110. -111. FileInputFormat.setInputPaths(conf, new Path(other_args.get(0))); -112. FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); -113. -114. JobClient.runJob(conf); -115. return 0; -116. } -117. -118. public static void main(String[] args) throws Exception { -119. int res = ToolRunner.run(new Configuration(), new WordCount(), args); -120. System.exit(res); -121. } -122. } -123. 
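Before the sample runs below, the source needs to be compiled against the Hadoop jars and packaged. The commands are only a sketch: the core jar name depends on the installed Hadoop release (a 1.x-style hadoop-core jar under ${HADOOP_HOME} is assumed here), and the jar path matches the one used in the sample runs.
$ mkdir wordcount_classes
$ javac -classpath ${HADOOP_HOME}/hadoop-core.jar -d wordcount_classes WordCount.java
$ jar -cvf /usr/joe/wordcount.jar -C wordcount_classes/ .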
-Sample Runs -Sample text-files as input: - -$ bin/hadoop dfs -ls /usr/joe/wordcount/input/ -/usr/joe/wordcount/input/file01 -/usr/joe/wordcount/input/file02 - -$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file01 -Hello World, Bye World! - -$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file02 -Hello Hadoop, Goodbye to hadoop. - -Run the application: - -$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount /usr/joe/wordcount/input /usr/joe/wordcount/output - -Output: - -$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000 -Bye 1 -Goodbye 1 -Hadoop, 1 -Hello 2 -World! 1 -World, 1 -hadoop. 1 -to 1 -Notice that the inputs differ from the first version we looked at, and how they affect the outputs. - -Now, let's plug in a pattern file which lists the word patterns to be ignored, via the DistributedCache. - -$ hadoop dfs -cat /user/joe/wordcount/patterns.txt -\. -\, -\! -to -Run it again, this time with more options: - -$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount -Dwordcount.case.sensitive=true /usr/joe/wordcount/input /usr/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt - -As expected, the output: - -$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000 -Bye 1 -Goodbye 1 -Hadoop 1 -Hello 2 -World 2 -hadoop 1 -Run it once more, this time switching off case-sensitivity: - -$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount -Dwordcount.case.sensitive=false /usr/joe/wordcount/input /usr/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt - -Sure enough, the output: - -$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000 -bye 1 -goodbye 1 -hadoop 2 -hello 2 -world 2 -Highlights -The second version of WordCount improves upon the previous one by using some features offered by the MapReduce framework: - -Demonstrates how applications can access configuration parameters in the configure method of the Mapper (and Reducer) implementations (lines 28-43). -Demonstrates how the DistributedCache can be used to distribute read-only data needed by the jobs. Here it allows the user to specify word patterns to skip while counting (line 104). -Demonstrates the utility of the Tool interface and the GenericOptionsParser to handle generic Hadoop command-line options (lines 87-116, 119). -Demonstrates how applications can use Counters (line 68) and how they can set application-specific status information via the Reporter instance passed to the map (and reduce) method (line 72). -Java and JNI are trademarks or registered trademarks of Sun Microsystems, Inc. in the United States and other countries.
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt.xml.xls deleted file mode 100644 index 9bf78621edcfd8628be103b9c09e30720e6bbebd..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Working and Components-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Working and Components-relation.txt deleted file mode 100644 index 0a30a228867617e34829ac71e2a021f989105bf7..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Working and Components-relation.txt +++ /dev/null @@ -1,97 +0,0 @@ -MapReduce&developer&依赖 -huge amount&unstructured datum&AGGREGATION -MapReduce&unstructured datum&依赖 -MapReduce&NCache cluster&依赖 -MapReduce¶llel&依赖 -MapReduce&huge amount&依赖 -MapReduce&developer&依赖 -MapReduce¶llel&依赖 -NCache cluster&cluster&GENERALIZATION -MapReduce&unstructured datum&依赖 -MapReduce&NCache cluster&依赖 -MapReduce&huge amount&依赖 -MapReduce&size&依赖 -cluster&size&AGGREGATION -MapReduce&node&依赖 -MapReduce&cluster&依赖 -MapReduce¶llel&依赖 -term “ MapReduce ”&two distinct phase&依赖 -‘ Map ’ phase&set&依赖 -set&datum&AGGREGATION -‘ Map ’ phase&datum&依赖 -‘ Reduce ’ phase&output&依赖 -‘ Reduce ’ phase&‘ Map ’&依赖 -‘ Reduce ’ phase&‘ Map ’&依赖 -‘ Reduce ’ phase&output&依赖 -user&key-value pair&依赖 -user&set&依赖 -user&intermediate key-value pair&依赖 -set&intermediate key-value pair&AGGREGATION -user&key-value pair&依赖 -user&intermediate key-value pair&依赖 -user&set&依赖 -Reducer&intermediate key-value pair&依赖 -Reducer&having&依赖 -Reducer&having&依赖 -Reducer&intermediate key-value pair&依赖 -example&combiner )&依赖 -cluster&three node&AGGREGATION -example&a mapreduce task (&依赖 -task&order&依赖 -task&product&依赖 -task&Mapper and extracts count&依赖 -task&order&依赖 -Mapper and extracts count&product&AGGREGATION -Reducer&node& -aggregated count&final aggregation&依赖 -count&figure 2&依赖 -aggregated count&Reducer node&依赖 -Mapper&output& -Reducer node&node&GENERALIZATION -two (&Combining&依赖 -two (&Combining&依赖 -aggregation and compilation&final result&AGGREGATION -care&aggregation and compilation&AGGREGATION -Combine phase&performance&依赖 -it&network traffic&依赖 -it&Mapper and Reducers&依赖 -NCache MapReduce&MapReduce&GENERALIZATION -NCache MapReduce&three phase&依赖 -NCache MapReduce&Map&依赖 -its&reducer& -NCache MapReduce&default reducer&依赖 -user&Reducer&实现 -Default reducer&output&依赖 -Default reducer&output&依赖 -mapper , combiner and reducer&NCache MapReduce task&依赖 -mapper , combiner and reducer&NCache cluster&依赖 -Mapper output&Combiner&依赖 -Mapper output&output&GENERALIZATION -it&Reducer&依赖 -Combiner&output& -Reducer&output&依赖 -’s output&specified chunk size&依赖 -Mapper&chunk& -Number&task&AGGREGATION -combiner or reducer once output chunk&configured chunk size&依赖 -typical MapReduce task&components :&依赖 -Mapper&initial input&依赖 -Combiner Factory&combiner&依赖 -Combiner Factory&combiner&依赖 -Combiner Factory&combiner&依赖 -its&keys& -Key Filter&filter cache datum&依赖 -Key Filter&user&依赖 -KeyFilter&Mapper phase&依赖 -Map&key&依赖 -it&false&依赖 -Mapper&key&依赖 -its&status& -component&track&依赖 -component&progress&依赖 -component&task&依赖 -progress&task&AGGREGATION -track&progress&AGGREGATION -output&task&AGGREGATION -you&output&依赖 -you&task&依赖 diff --git 
a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt deleted file mode 100644 index 8781313caf4876561408b0c880c13f81b4cba1cf..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt +++ /dev/null @@ -1,48 +0,0 @@ -MapReduce in NCache allows developers to process huge amounts of unstructured data in parallel across an NCache cluster. To distribute input data and analyze it in parallel, MapReduce operates in parallel on all nodes in a cluster of any size. - -MapReduce is a programming model for processing and generating large data sets with a parallel, distributed algorithm on a cluster. The term “MapReduce” refers to two distinct phases. The first phase is ‘Map’ phase, which takes a set of data and converts it into another set of data, where individual items are broken down into key-value pairs. The second phase is ‘Reduce’ phase, which takes output from ‘Map’ as an input and reduces that data set into a smaller and more meaningful data set. - -A user defined Mapper processes a key-value pair to generate a set of intermediate key-value pairs. Reducer processes all those intermediate key-value pairs (having same intermediate key) to aggregate, perform calculations or any other operation on the pairs. Another optional component, Combiner, performs merging of the intermediate key-value pairs generated by Mapper before these key-value pairs can be sent over to the Reducer. - -The following example illustrates a MapReduce task (with and without combiner) being executed over a cluster of three nodes. The task takes orders as an input to the Mapper and extracts count of products consumed in it. In figure 1, Mapper’s output is directly sent to the reducer and is being aggregated on Reducer’s node whereas in figure 2, count over a single node is aggregated first and this aggregated count is sent to the Reducer node for final aggregation. - -MapReduce without Combiner: - -MapReduce in Ncache without Combiner - -MapReduce with Combiner: - -MapReduce in Ncache with Combiner - -How does MapReduce Work? -Generally, MapReduce consists of two (sometimes three) phases: i.e. Mapping, Combining (optional) and Reducing. - -Mapping phase: Filters and prepares the input for the next phase that may be Combining or Reducing. -Reduction phase: Takes care of the aggregation and compilation of the final result. -Combining phase: Responsible for reduction local to the node, before sending the input to the Reducers. Combine phase optimizes performance as it minimizes the network traffic between Mapper and Reducers by sending the output to the Reducer in chunks. -Similarly, NCache MapReduce has three phases: Map, Combine, and Reduce. Only the Mapper is necessary to implement, Reducer and Combiner implementations are optional. NCache MapReduce will execute its default reducer if the user does not implement Reducer. Default reducer merges output omitted by Mapper into an array. - -The Mapper, Combiner and Reducer are executed simultaneously during an NCache MapReduce task on the NCache cluster. Mapper output is individually sent to the Combiner. When Combiner’s output reaches the specified chunk size, it is then sent to the Reducer, which finalizes and persists the output. - -In order to monitor the submitted task, a traceable object is provided to the user. 
- -Number of tasks to be executed simultaneously and Mapper’s output chunk is configurable. Mapper’s output is sent to combiner or reducer once output chunk reaches the configured chunk size. See NCache Administrator’s Guide. - -A typical MapReduce task has the following components: - -Mapper: Processes the initial input and enables user to emit the output into a dictionary to be used as an input for the combiner or reducer. - -Combiner Factory: creates and manages combiners for each key emitted into output by the mapper. - -Combiner: Works as local reducer to the node where Mapper’s output is combined to minimize traffic between Mapper and Reducer. - -Reducer Factory: create and manages reducers for each key emitted into output by the mapper or combiner. - -Reducer: Processes all those intermediate key-value pairs generated by Mapper or combined by Combiner to aggregate, perform calculations or apply different operations to produce the reduced output. - -Key Filter: Key Filter, as the name indicates, allows the user to filter cache data based on its keys before sent to the Mapper. The KeyFilter is called during Mapper phase. If it returns true, the Map will be executed on the key. If it returns false, Mapper will skip the key and move to next one from the Cache. - -TrackerTask: This component lets you keep track of the progress of the task and its status as the task is executed. And lets you fetch the output of the task and enumerate it. - -Output: The output is stored in-memory, on the server side. It can be enumerated using the TrackableTask instance on the client application. - diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt.xml.xls deleted file mode 100644 index f6609e6302a873309ff95de23a68c65af5124b9f..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt.xml.xls and /dev/null differ diff --git "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components-relation.txt" "b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components-relation.txt" deleted file mode 100644 index dffaa920fc61591c098ac7314827be8a24c8cb42..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components-relation.txt" +++ /dev/null @@ -1,22 +0,0 @@ -Split&logical representation&依赖 -logical representation&block&AGGREGATION -Split&block&依赖 -map-reduce 1 mapper&1 split&依赖 -map-reduce 1 mapper&map-reduce 1 mapper&依赖 -map-reduce 1 mapper&time&依赖 -10 mapper&input file&依赖 -block size&10 split&依赖 -We&client , master and slave&依赖 -Client&job&依赖 -We&job&依赖 -we&mapper and reducer&依赖 -we&program&依赖 -We&program&依赖 -We&job&依赖 -We&sub-division&依赖 -sub-division&job&AGGREGATION -job&smaller task&依赖 -Master&multiple task&依赖 -Master&work or job&依赖 -actual work&slave&依赖 -Master&job&依赖 diff --git "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt" "b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt" deleted file mode 100644 index 084bf15f448b467539bda8b7ed81c110215c9f30..0000000000000000000000000000000000000000 --- "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt" +++ /dev/null @@ -1,24 +0,0 @@ -MapReduce – Components: -Split - -Split can be called as a logical 
representation of block. In map-reduce 1 mapper can process 1 split at a time. - -We have seen in HDFS that the default size can be 64mb or 128mb, then if file size is 1280mb, block size is 128mb than we will have 10 splits, then 10 mappers will run for the input file. - -Job - -We have seen in the HDFS that we have a client, master and slaves. Client configures the job and submits it to the master. We can say job as a program in which we execute mapper and reducer. - -Task - -We can say Task as a sub-division of job. Here job is divided into smaller tasks. Master divides the work or job into multiple tasks and gives them to slaves. The actual work is done by the slaves. - -Here we can also say that Client need to submit the job to Resource Manger which is running on Master, then Master converts that job and submits tasks to the slaves. These all tasks are run parallel and independent on each other. - -Resource Manager - -It is a daemon which runs on Master node. - -Node Manager - -It is a daemon which runs on Slaves. \ No newline at end of file diff --git "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt.xml.xls" "b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt.xml.xls" deleted file mode 100644 index 3cc9106b1e3ba326be2ace1739b783522c4ab8f1..0000000000000000000000000000000000000000 Binary files "a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt.xml.xls" and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce-relation.txt deleted file mode 100644 index 6b4a842419b3cfa5cf143310475c8b7102613bb7..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce-relation.txt +++ /dev/null @@ -1,118 +0,0 @@ -core component&Apache Hadoop software framework&AGGREGATION -MapReduce Stephen J. Bigelow&Apache Hadoop software framework&依赖 -MapReduce Stephen J. Bigelow&Apache Hadoop software framework&依赖 -MapReduce Stephen J. 
Bigelow&Apache Hadoop software framework&依赖 -distributed processing&commodity computer cluster&依赖 -distributed processing&commodity computer cluster&依赖 -node&cluster&AGGREGATION -node&own storage&依赖 -its&storage& -distributed processing&massive unstructured datum set&AGGREGATION -distributed processing&commodity computer cluster&依赖 -it&result&依赖 -MapReduce&two essential function&依赖 -it&result&依赖 -it&node&依赖 -it&node&依赖 -it&node&依赖 -it&node&依赖 -it&result&依赖 -it&result&依赖 -MapReduce&original version&依赖 -original version&MapReduce&AGGREGATION -MapReduce&MapReduce&依赖 -master node&jobs and resource&依赖 -master node&cluster&依赖 -master node&node&GENERALIZATION -component&completed job&依赖 -previous JobTracker and TaskTracker daemon&introduction&依赖 -previous JobTracker and TaskTracker daemon&component&依赖 -introduction&mapreduce and hadoop version&AGGREGATION -previous JobTracker and TaskTracker daemon&another resource negotiator ( yarn )&依赖 -component&another resource negotiator ( yarn )&AGGREGATION -previous JobTracker and TaskTracker daemon&mapreduce and hadoop version&依赖 -ResourceManager&master node&依赖 -submission and scheduling&job&AGGREGATION -It&job&依赖 -NodeManager&slave node&依赖 -NodeManager&other daemon&依赖 -slave node&node&GENERALIZATION -MapReduce&massive cluster size&依赖 -MapReduce¶llel&依赖 -number&server&AGGREGATION -job&number&依赖 -cluster size&final result&依赖 -job&server&依赖 -job&results& -MapReduce&software development&实现 -MapReduce&C , C++ , Java , Ruby , Perl and Python&依赖 -MapReduce&several language&依赖 -programmer&MapReduce library&依赖 -node&status&依赖 -node&master node&依赖 -its&status& -master node&piece&依赖 -master node&cluster&依赖 -master node&job&依赖 -piece&job&AGGREGATION -master node&other available node&依赖 -its&ability& -result&node&AGGREGATION -power&MapReduce&AGGREGATION -user&time&依赖 -user&number&依赖 -user&time&依赖 -number&time&AGGREGATION -user&number&依赖 -user&26 people&依赖 -separate sheet&paper&AGGREGATION -user&task&依赖 -user&contrast&依赖 -map aspect&MapReduce&AGGREGATION -her&place& -MapReduce&element& -user&26 box&依赖 -user&single-word page&依赖 -26 box&first letter&依赖 -first letter&word&AGGREGATION -their&pages& -26 box&word&依赖 -user&a box and sort&依赖 -user&stack alphabetically&依赖 -number&reduce aspect&依赖 -number&MapReduce&依赖 -example&reduce aspect&AGGREGATION -number&reduce aspect&依赖 -number&reduce aspect&依赖 -number&MapReduce&依赖 -reduce aspect&MapReduce&AGGREGATION -number&page&AGGREGATION -number&MapReduce&依赖 -broad range&real-world use&AGGREGATION -social networking site&example&依赖 -social networking site&MapReduce&依赖 -users&friends& -historical behavior&user&AGGREGATION -booking website&MapReduce&依赖 -industrial facility&different sensor&依赖 -industrial facility&equipment datum&依赖 -industrial facility&installation&依赖 -Many business&capital and overhead&依赖 -Hadoop and MapReduce&enormous scalability&依赖 -organization&public cloud service&依赖 -organization&result&依赖 -Hadoop and MapReduce&minimal capital cost&依赖 -organization&Hadoop and MapReduce&依赖 -its&offering& -HDInsight service&provision Hadoop&依赖 -its&service& -Microsoft Azure&HDInsight service&依赖 -HDInsight service&user&依赖 -Hadoop and MapReduce&one option&依赖 -Hadoop and MapReduce&organization&依赖 -Spark and Hadoop cluster&private , on-premises big data infrastructure&依赖 -Organizations&Apache Spark&依赖 -Organizations&other platform&依赖 -big data framework&type&依赖 -big data framework&processing task&依赖 -type&processing task&AGGREGATION diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce.txt 
b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce.txt deleted file mode 100644 index 0722022c07ae5a5942421179c6fbc7fbdd33e1a6..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce.txt +++ /dev/null @@ -1,39 +0,0 @@ -MapReduce -Stephen J. Bigelow -By -Stephen J. Bigelow, Senior Technology Editor -MapReduce is a core component of the Apache Hadoop software framework. - -Hadoop enables resilient, distributed processing of massive unstructured data sets across commodity computer clusters, in which each node of the cluster includes its own storage. MapReduce serves two essential functions: it filters and parcels out work to various nodes within the cluster or map, a function sometimes referred to as the mapper, and it organizes and reduces the results from each node into a cohesive answer to a query, referred to as the reducer. - -How MapReduce works -The original version of MapReduce involved several component daemons, including: - -JobTracker -- the master node that manages all the jobs and resources in a cluster; -TaskTrackers -- agents deployed to each machine in the cluster to run the map and reduce tasks; and -JobHistory Server -- a component that tracks completed jobs and is typically deployed as a separate function or with JobTracker. -With the introduction of MapReduce and Hadoop version 2, previous JobTracker and TaskTracker daemons have been replaced with components of Yet Another Resource Negotiator (YARN), called ResourceManager and NodeManager. - -ResourceManager runs on a master node and handles the submission and scheduling of jobs on the cluster. It also monitors jobs and allocates resources. -NodeManager runs on slave nodes and interoperates with Resource Manager to run tasks and track resource usage. NodeManager can employ other daemons to assist with task execution on the slave node. -To distribute input data and collate results, MapReduce operates in parallel across massive cluster sizes. Because cluster size doesn't affect a processing job's final results, jobs can be split across almost any number of servers. Therefore, MapReduce and the overall Hadoop framework simplify software development. - -MapReduce is available in several languages, including C, C++, Java, Ruby, Perl and Python. Programmers can use MapReduce libraries to create tasks without dealing with communication or coordination between nodes. - -MapReduce is also fault-tolerant, with each node periodically reporting its status to a master node. If a node doesn't respond as expected, the master node reassigns that piece of the job to other available nodes in the cluster. This creates resiliency and makes it practical for MapReduce to run on inexpensive commodity servers. - -MapReduce examples and uses -The power of MapReduce is in its ability to tackle huge data sets by distributing processing across many nodes, and then combining or reducing the results of those nodes. - -As a basic example, users could list and count the number of times every word appears in a novel as a single server application, but that is time-consuming. By contrast, users can split the task among 26 people, so each takes a page, writes a word on a separate sheet of paper and takes a new page when they're finished. This is the map aspect of MapReduce. And if a person leaves, another person takes his or her place. This exemplifies MapReduce's fault-tolerant element. 
- -When all the pages are processed, users sort their single-word pages into 26 boxes, which represent the first letter of each word. Each user takes a box and sorts each word in the stack alphabetically. The number of pages with the same word is an example of the reduce aspect of MapReduce. - -There is a broad range of real-world uses for MapReduce involving complex and seemingly unrelated data sets. For example, a social networking site could use MapReduce to determine users' potential friends, colleagues and other contacts based on site activity, names, locations, employers and many other data elements. A booking website could use MapReduce to examine the search criteria and historical behaviors of users, and can create customized offerings for each. An industrial facility could collect equipment data from different sensors across the installation and use MapReduce to tailor maintenance schedules or predict equipment failures to improve overall uptime and cost-savings. - -MapReduce services and alternatives -One challenge with MapReduce is the infrastructure it requires to run. Many businesses that could benefit from big data tasks can't sustain the capital and overhead needed for such an infrastructure. As a result, some organizations rely on public cloud services for Hadoop and MapReduce, which offer enormous scalability with minimal capital costs or maintenance overhead. - -For example, Amazon Web Services (AWS) provides Hadoop as a service through its Amazon Elastic MapReduce (EMR) offering. Microsoft Azure offers its HDInsight service, which enables users to provision Hadoop, Apache Spark and other clusters for data processing tasks. Google Cloud Platform provides its Cloud Dataproc service to run Spark and Hadoop clusters. - -For organizations that prefer to build and maintain private, on-premises big data infrastructures, Hadoop and MapReduce represent only one option. Organizations can opt to deploy other platforms, such as Apache Spark, High-Performance Computing Cluster and Hydra. The big data framework an enterprise chooses will depend on the types of processing tasks required, supported programming languages, and performance and infrastructure demands. 
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce.txt.xml.xls deleted file mode 100644 index be8f637081ad7e9390a5aa95e0570957a2922a26..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/MapReduce.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop-relation.txt deleted file mode 100644 index ad7368a72f861cdf2ab583c4f42cda545e7b5d1d..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop-relation.txt +++ /dev/null @@ -1,223 +0,0 @@ -component&Apache Hadoop ecosystem&AGGREGATION -Get start&MapReduce&依赖 -Apache Hadoop ecosystem&massive data processing&依赖 -hadoop december 6 , 2020 facebooktwitterlinkedin mapreduce&framework&依赖 -Apache Hadoop ecosystem&Hadoop ecosystem&GENERALIZATION -Get start&hadoop december 6 , 2020 facebooktwitterlinkedin mapreduce&依赖 -hadoop december 6 , 2020 facebooktwitterlinkedin mapreduce&Apache Hadoop ecosystem&依赖 -Other component&Apache Hadoop&AGGREGATION -MapReduce component&dispersed and parallel algorithm&依赖 -MapReduce component&processing&依赖 -MapReduce component&Hadoop ecosystem&依赖 -MapReduce component&massive datum&依赖 -processing&massive datum&AGGREGATION -understanding&MapReduce&AGGREGATION -MapReduce&real-life application&依赖 -It&reader&依赖 -It&insight&依赖 -vast volume&datum&AGGREGATION -write application&datum&依赖 -write application&large cluster&依赖 -vast amount&datum&AGGREGATION -write application&vast amount&依赖 -Hadoop framework&framework&GENERALIZATION -we&programming model&依赖 -we&computer cluster&依赖 -we&large dataset&依赖 -application&datum&依赖 -enormous volume&datum&AGGREGATION -It&enormous volume&实现 -It&datum&实现 -We&former task&依赖 -We&former task&依赖 -we&chunk&依赖 -input dataset&dataset&GENERALIZATION -we&map job&依赖 -we&input dataset&依赖 -map job&job&GENERALIZATION -Map task&chunk&依赖 -Map task&task&GENERALIZATION -Map task¶llell&依赖 -we&reduce task&依赖 -map&reduce task&依赖 -we&input&依赖 -we&output&依赖 -reducers process&intermediate datum&依赖 -reducers process&map&依赖 -final output&framework&AGGREGATION -reducers process&map&依赖 -reducers process&intermediate datum&依赖 -reducers process&intermediate datum&依赖 -reducers process&map&依赖 -smaller tuple&task&依赖 -reducers process&map&依赖 -reducers process&intermediate datum&依赖 -MapReduce framework&framework&GENERALIZATION -MapReduce framework&task&依赖 -MapReduce framework&scheduling and monitoring&依赖 -scheduling and monitoring&task&AGGREGATION -failed task&framework&依赖 -framework&distributed processing&依赖 -framework&programmer&依赖 -framework&little expertise&依赖 -MapReduce&overview&依赖 -MapReduce&MapReduce Architecture and MapReduce ’s phase&依赖 -overview&MapReduce Architecture and MapReduce ’s phase&AGGREGATION -MapReduce&overview&依赖 -MapReduce&MapReduce Architecture and MapReduce ’s phase&依赖 -diagram&MapReduce architecture&依赖 -MapReduce architecture&various component&依赖 -brief description&understanding&依赖 -brief description&component&AGGREGATION -brief description&works&依赖 -our&understanding& -piece&actual work&AGGREGATION -MapReduce job&job&GENERALIZATION -MapReduce job&many small task&依赖 -task tracker&tracker&GENERALIZATION -tracker&scheduling job&依赖 -tracker&role&依赖 -status&task&AGGREGATION -tracker&task&依赖 -job 
tracker&tracker&GENERALIZATION -result&mapping and reduce&AGGREGATION -a program or application programming&MapReduce&依赖 -a program or application programming&job&依赖 -MapReduce&job&依赖 -MapReduce&many client&依赖 -division&main job&AGGREGATION -client&job&依赖 -client&job&依赖 -client&MapReduce Master&依赖 -client&MapReduce Master&依赖 -master&job&依赖 -master&equal sub-part&依赖 -job-part&two main task&依赖 -job-part&MapReduce&依赖 -requirement&organization or company&AGGREGATION -developer&logic&依赖 -reducer&output&依赖 -reducer&final output&依赖 -MapReduce program&program&GENERALIZATION -diagram&simplified flow diagram&依赖 -diagram&MapReduce program&实现 -trackers&work&依赖 -job&two key component&依赖 -job&map task&依赖 -map task&role&依赖 -map task&job-part&依赖 -map task&task&GENERALIZATION -map task&splitting job&依赖 -reduce task&role&依赖 -reduce task&shuffling&依赖 -job tracker&act&依赖 -job tracker&master&依赖 -job tracker&master&依赖 -job tracker&act&依赖 -It&job&依赖 -job tracker schedule job&job tracker schedule job&依赖 -It&job&依赖 -task tracker&map task&依赖 -Task tracker&job tracker&依赖 -Task tracker&assigned job&依赖 -Task tracker&status&依赖 -status&assigned job&AGGREGATION -diagram&work&依赖 -MapReduce program&three main phase&依赖 -phase&MapReduce&AGGREGATION -combiner phase&phase&GENERALIZATION -first phase&program&AGGREGATION -Mapping Phase This&program&依赖 -splitting step&step&GENERALIZATION -dataset&equal unit&依赖 -dataset&splitting step&依赖 -splitting step&input split&依赖 -Hadoop&RecordReader&依赖 -splitting step&TextInputFormat&依赖 -key-value pair&mapping step&依赖 -key-value pair&input&依赖 -mapper&that&依赖 -mapping step&logic&依赖 -mapping step&step&GENERALIZATION -output&same form and ( key-value pair&AGGREGATION -mapper&key-value pair&依赖 -mapper&step&依赖 -second phase&completion&依赖 -second phase&Mapping phase&依赖 -completion&Mapping phase&AGGREGATION -Mapping phase&phase&GENERALIZATION -second phase&place&依赖 -It&two main step ###&依赖 -It&two main step&依赖 -shuffling phase&duplicate value&依赖 -shuffling phase&removal&依赖 -removal&duplicate value&AGGREGATION -grouping&value&AGGREGATION -output&phase&AGGREGATION -output&Reducer phase&依赖 -output&reducer phase&依赖 -output&shuffling phase&AGGREGATION -output&input&依赖 -shuffling phase&phase&GENERALIZATION -Reducer phase&phase&GENERALIZATION -reducer&input&依赖 -summary&entire dataset&AGGREGATION -output&hdf&依赖 -diagram&example&依赖 -example&MapReduce&AGGREGATION -diagram&three main phase&依赖 -Example&MapReduce&AGGREGATION -duplicate output&phase&依赖 -duplicate output&single output&依赖 -combiner phase&Shuffling phase&依赖 -Shuffling phase&phase&GENERALIZATION -performance&Jobs&AGGREGATION -four phase&MapReduce&AGGREGATION -benefit&Hadoop MapReduce Speed&AGGREGATION -MapReduce&huge unstructured datum&依赖 -MapReduce&short time&依赖 -MapReduce framework&failure&依赖 -scale-out feature&process or store datum&依赖 -Hadoop&scale-out feature&依赖 -scale-out feature&cost-effective manner&依赖 -scale-out feature&user&依赖 -MapReduce&user&依赖 -MapReduce&application&依赖 -replica&network&依赖 -replica&various node&依赖 -replica&datum&AGGREGATION -event&failure&AGGREGATION -copy&datum&AGGREGATION -multiple job-part&same dataset&AGGREGATION -multiple job-part&MapReduce&依赖 -multiple job-part¶llel manner&依赖 -practical application&MapReduce program&AGGREGATION -hadoop mapreduce&practical application&依赖 -hadoop mapreduce&MapReduce program&依赖 -application&hadoop mapreduce&AGGREGATION -E-commerce E-commerce company&MapReduce&依赖 -E-commerce E-commerce company&MapReduce&依赖 -Social network&certain information&依赖 -Social network&Facebook , Twitter , and 
LinkedIn&依赖 -Social network&Facebook , Twitter , and LinkedIn&依赖 -Social network&certain information&依赖 -Social network&social media platform&依赖 -Social network&social media platform&依赖 -who&status&依赖 -It&important information&依赖 -your&status& -who&profile&依赖 -It&status&依赖 -your&profile& -Entertainment Netflix&MapReduce&依赖 -clicks and log&online customer&AGGREGATION -customer&interests and behavior&依赖 -information&movie&依赖 -crucial processing component&Hadoop framework&AGGREGATION -Conclusion MapReduce&MapReduce&GENERALIZATION -Conclusion MapReduce&Hadoop framework&依赖 -quick , scalable , and cost-effective program&huge datum&依赖 -It&quick , scalable , and cost-effective program&依赖 -company&framework&依赖 -company&framework&依赖 -their&strategies& diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt deleted file mode 100644 index 43986d403ef2ae475090aea4195357c012ddb377..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt +++ /dev/null @@ -1,178 +0,0 @@ -select_all Edge AppSpace -SolutionHub -Performance / CDN -Security -Virtual Waiting Room -A/B Testing -Search AppSpace -AppStack -Node.js Edge Hosting -RunStack -Containers -Serverless -gps_fixed Core Platform -Section Control Plane -Edge AppSpace -Adaptive Edge Engine (AEE) -Global Edge Network -Solutions -SaaS -PaaS & Hosting Providers -Edge App Hosting -Docs -Resources -Blog -Case Studies -Edge Content Library -Solution Briefs -Product Videos -Engineering Education -About Section -Partners -Changelog -Pricing -Contact Log In Get Started -Platform -select_allEdge AppSpace -SolutionHub -Performance / CDN -Security -Virtual Waiting Room -A/B Testing -AppStack -Node.js Edge Hosting -RunStack -Containers -Serverless -Search AppSpace -gps_fixedCore Platform -Section Control Plane -Edge AppSpace -Adaptive Edge Engine (AEE) -Global Edge Network -Docs -Resources -Blog -Case Studies -Content Library -Solution Briefs -Changelog -Engineering Education -Partners -About Section -Pricing -Contact -Log In -Get Started -Understanding MapReduce in Hadoop -December 6, 2020 -FacebookTwitterLinkedIn -MapReduce is a component of the Apache Hadoop ecosystem, a framework that enhances massive data processing. Other components of Apache Hadoop include Hadoop Distributed File System (HDFS), Yarn, and Apache Pig. - -The MapReduce component enhances the processing of massive data using dispersed and parallel algorithms in the Hadoop ecosystem. This programming model is applied in social platforms and e-commerce to analyze huge data collected from online users. - -This article provides an understanding of MapReduce in Hadoop. It will enable readers to gain insights on how vast volumes of data is simplified and how MapReduce is used in real-life applications. - -Introduction to MapReduce in Hadoop -MapReduce is a Hadoop framework used for writing applications that can process vast amounts of data on large clusters. It can also be called a programming model in which we can process large datasets across computer clusters. This application allows data to be stored in a distributed form. It simplifies enormous volumes of data and large scale computing. - -There are two primary tasks in MapReduce: map and reduce. We perform the former task before the latter. In the map job, we split the input dataset into chunks. Map task processes these chunks in parallell. 
The map we use outputs as inputs for the reduce tasks. Reducers process the intermediate data from the maps into smaller tuples, that reduces the tasks, leading to the final output of the framework. - -The MapReduce framework enhances the scheduling and monitoring of tasks. The failed tasks are re-executed by the framework. This framework can be used easily, even by programmers with little expertise in distributed processing. MapReduce can be implemented using various programming languages such as Java, Hive, Pig, Scala, and Python. - -How MapReduce in Hadoop works -An overview of MapReduce Architecture and MapReduce’s phases will help us understand how MapReduce in Hadoop works. - -MapReduce architecture -The following diagram shows a MapReduce architecture. - -MapReduce Architecture - -Image Source: A4Academics - -MapReduce architecture consists of various components. A brief description of these components can improve our understanding on how MapReduce works. - -Job: This is the actual work that needs to be executed or processed -Task: This is a piece of the actual work that needs to be executed or processed. A MapReduce job comprises many small tasks that need to be executed. -Job Tracker: This tracker plays the role of scheduling jobs and tracking all jobs assigned to the task tracker. -Task Tracker: This tracker plays the role of tracking tasks and reporting the status of tasks to the job tracker. -Input data: This is the data used to process in the mapping phase. -Output data: This is the result of mapping and reducing. -Client: This is a program or Application Programming Interface (API) that submits jobs to the MapReduce. MapReduce can accept jobs from many clients. -Hadoop MapReduce Master: This plays the role of dividing jobs into job-parts. -Job-parts: These are sub-jobs that result from the division of the main job. -In the MapReduce architecture, clients submit jobs to the MapReduce Master. This master will then sub-divide the job into equal sub-parts. The job-parts will be used for the two main tasks in MapReduce: mapping and reducing. - -The developer will write logic that satisfies the requirements of the organization or company. The input data will be split and mapped. - -The intermediate data will then be sorted and merged. The reducer that will generate a final output stored in the HDFS will process the resulting output. - -The following diagram shows a simplified flow diagram for the MapReduce program. - -MapReduce Flow Diagram - -Image Source: Data Flair - -How job trackers and task trackers work -Every job consists of two key components: mapping task and reducing task. The map task plays the role of splitting jobs into job-parts and mapping intermediate data. The reduce task plays the role of shuffling and reducing intermediate data into smaller units. - -The job tracker acts as a master. It ensures that we execute all jobs. The job tracker schedules jobs that have been submitted by clients. It will assign jobs to task trackers. Each task tracker consists of a map task and reduces the task. Task trackers report the status of each assigned job to the job tracker. The following diagram summarizes how job trackers and task trackers work. - -Job Trackers and Task Trackers - -Image Source: CNBlogs - -Phases of MapReduce -The MapReduce program is executed in three main phases: mapping, shuffling, and reducing. There is also an optional phase known as the combiner phase. - -Mapping Phase -This is the first phase of the program. 
There are two steps in this phase: splitting and mapping. A dataset is split into equal units called chunks (input splits) in the splitting step. Hadoop consists of a RecordReader that uses TextInputFormat to transform input splits into key-value pairs. - -The key-value pairs are then used as inputs in the mapping step. This is the only data format that a mapper can read or understand. The mapping step contains the coding logic that is applied to these data blocks. In this step, the mapper processes the key-value pairs and produces an output of the same form (key-value pairs). - -Shuffling phase -This is the second phase that takes place after the completion of the Mapping phase. It consists of two main steps: sorting and merging. In the sorting step, the key-value pairs are sorted using the keys. Merging ensures that key-value pairs are combined. - -The shuffling phase facilitates the removal of duplicate values and the grouping of values. Different values with similar keys are grouped. The output of this phase will be keys and values, just like in the Mapping phase. - -Reducer phase -In the reducer phase, the output of the shuffling phase is used as the input. The reducer processes this input further to reduce the intermediate values into smaller values. It provides a summary of the entire dataset. The output from this phase is stored in the HDFS. - -The following diagram shows an example of MapReduce with the three main phases. Splitting is often included in the mapping stage. - -Example of MapReduce - -Image Source: Edureka - -Combiner phase -This is an optional phase that's used for optimizing the MapReduce process. It's used for reducing the map outputs at the node level. In this phase, duplicate outputs from the map outputs can be combined into a single output. The combiner phase increases speed in the Shuffling phase by improving the performance of Jobs. - -The following diagram shows how all four phases of MapReduce have been applied. - -MapReduce with Combiner Phase - -Image Source: Cloud Front - -Benefits of Hadoop MapReduce -Speed: MapReduce can process huge unstructured data in a short time. -Fault-tolerance: The MapReduce framework can handle failures. -Cost-effective: Hadoop has a scale-out feature that enables users to process or store data in a cost-effective manner. -Scalability: Hadoop provides a highly scalable framework. MapReduce allows users to run applications from many nodes. -Data availability: Replicas of data are sent to various nodes within the network. This ensures copies of the data are available in the event of failure. -Parallel Processing: In MapReduce, multiple job-parts of the same dataset can be processed in a parallel manner. This reduces the time taken to complete a task. -Applications of Hadoop MapReduce -The following are some of the practical applications of the MapReduce program. - -E-commerce -E-commerce companies such as Walmart, E-Bay, and Amazon use MapReduce to analyze buying behavior. MapReduce provides meaningful information that is used as the basis for developing product recommendations. Some of the information used includes site records, e-commerce catalogs, purchase history, and interaction logs. - -Social networks -The MapReduce programming tool can evaluate certain information on social media platforms such as Facebook, Twitter, and LinkedIn. It can evaluate important information such as who liked your status and who viewed your profile. - -Entertainment -Netflix uses MapReduce to analyze the clicks and logs of online customers. 
This information helps the company suggest movies based on customers’ interests and behavior. - -Conclusion -MapReduce is a crucial processing component of the Hadoop framework. It’s a quick, scalable, and cost-effective program that can help data analysts and developers process huge data. - -This programming model is a suitable tool for analyzing usage patterns on websites and e-commerce platforms. Companies providing online services can utilize this framework to improve their marketing strategies. \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt.xml.xls deleted file mode 100644 index a5b6491de17c059764cf9ae23f2108a1ecd86f2c..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021-relation.txt deleted file mode 100644 index b770a957054fafed7d850f2371eade4b589219f1..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021-relation.txt +++ /dev/null @@ -1,123 +0,0 @@ -enormous measure&datum&AGGREGATION -Reduce task&datum&依赖 -mapping and splitting&datum&AGGREGATION -Our&Programs& -you&privacy policy&依赖 -our&policy& -whatsapp & other means&communication&AGGREGATION -project&MapReduce&AGGREGATION -enormous scope&data examination&AGGREGATION -MapReduce Architecture Components&MapReduce Architecture 1&AGGREGATION -two significant part&Hadoop&AGGREGATION -MAPREDUCE ARCHITECTURE HDFS and MapReduce architecture&Hadoop&依赖 -MapReduce task&2 phase&依赖 -MapReduce&many programming language&依赖 -MapReduce&different diverse various improvement&依赖 -Map Phase Reduce&library&依赖 -map every one&position&AGGREGATION -motivation&position&依赖 -motivation&map every one&依赖 -motivation&map every one&依赖 -motivation&position&依赖 -it&comparable undertaking&依赖 -it&it&依赖 -motivation&map every one&依赖 -motivation&position&依赖 -who&Job&依赖 -who&MapReduce&依赖 -MAPREDUCE ARCHITECTURE Components&MapReduce Architecture&AGGREGATION -numerous client&work&依赖 -Hadoop MapReduce Master&position part&依赖 -Hadoop MapReduce Master&specific occupation&依赖 -aftereffect&last yield&依赖 -aftereffect&work part&AGGREGATION -end-product&preparation&依赖 -we&MapReduce Architecture&依赖 -we&client&依赖 -client&specific size&依赖 -client&Hadoop MapReduce Master&依赖 -job&specific size&AGGREGATION -client&job&依赖 -MapReduce expert&additional identical job part&依赖 -MapReduce expert&job&依赖 -Map&programming&依赖 -Map&necessity&依赖 -necessity&utilization case&AGGREGATION -Map&tackling&依赖 -engineer&rationale&依赖 -business&that&依赖 -their&rationale& -we&which&依赖 -Map&moderate key-esteem pair&依赖 -Map&yield&依赖 -its&yield& -output&Map&AGGREGATION -last yield&Hadoop Distributed File System&依赖 -last yield&hdf&依赖 -n number&MapReduce assignment&AGGREGATION -calculation&upgraded way&依赖 -calculation&extent&依赖 -calculation&least&依赖 -we&MapReduce phase&依赖 -its&architecture& -comprehension&architecture&AGGREGATION -MapReduce Architecture&example , Map phase and Reduce phase&依赖 -MapReduce Architecture&two phase&依赖 -its&use& -principle use&key-esteem set&依赖 
-principle use&input datum&依赖 -sort&address&AGGREGATION -id&sort&AGGREGATION -key&address&依赖 -key&sort&依赖 -Map ( ) capacity&memory vault&依赖 -its&vault& -Map ( ) capacity&input key-esteem pair&依赖 -one&input key-esteem pair&AGGREGATION -Map ( ) capacity&one&依赖 -middle&key-esteem&AGGREGATION -reducer total&data-dependent&依赖 -reducer total&key-esteem pair&依赖 -its&pair& -reducer total&key-esteem pair&依赖 -reducer total&key-esteem pair&依赖 -reducer total&key-esteem pair&依赖 -reducer total&data-dependent&依赖 -reducer total&data-dependent&依赖 -reducer total&key-esteem pair&依赖 -reducer total&data-dependent&依赖 -reducer total&key-esteem pair&依赖 -reducer total&data-dependent&依赖 -reducer total&key-esteem pair&依赖 -reducer total&data-dependent&依赖 -reducer total&data-dependent&依赖 -reducer total&key-esteem pair&依赖 -reducer total&key-esteem pair&依赖 -reducer total&data-dependent&依赖 -reducer total&data-dependent&依赖 -Task Tracker&MapReduce Architecture&依赖 -Task Tracker&Task Tracker&依赖 -It&real slave&依赖 -one&MapReduce task&依赖 -Task Tracker&one&依赖 -Task Tracker&node&依赖 -one&node&AGGREGATION -additionally one significant segment&MapReduce Architecture&AGGREGATION -plan&Hadoop&AGGREGATION -plan&different objective&依赖 -different objective&it&依赖 -different objective&you&依赖 -hadoop mapreduce architecture diagram&hadoop mapreduce architecture diagram&依赖 -Hadoop MapReduce framework architecture&three significant layer&依赖 -MapReduce Architecture system&monstrous information&依赖 -MapReduce Architecture system&mind-boggling interaction&依赖 -other supporting square&Hadoop&AGGREGATION -you&information examiner&依赖 -you&well-known programming language&依赖 -you&profession&依赖 -you&data science field&依赖 -you&point&依赖 -their&jobs& -Academy&Program& -’s postgraduate certificate program&Cloud Computing&依赖 -’s postgraduate certificate program&Cloud aspirant&依赖 diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt deleted file mode 100644 index 62946b28397f9d778aefc24118fb724b5c6c1c1d..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt +++ /dev/null @@ -1,73 +0,0 @@ -INTRODUCTION -MapReduce Architecture is a programming model and a software framework utilized for preparing enormous measures of data. MapReduce program works in two stages, to be specific, Map and Reduce. Map requests that arrange with mapping and splitting of data while Reduce tasks reduce and shuffle the data. - -Want To Know More About Our Programs? -Name -Email - -IN  |India (+91) -+91 -Phone - -City - -Course Interested In -By proceeding, you agree to our privacy policy and also agree to receive information from Jigsaw Academy through WhatsApp & other means of communication. - -Submit -Hadoop MapReduce Architecture is fit for running MapReduce programs written in different languages: C, Python, Ruby, and Java. The projects of MapReduce in cloud computing are equal, accordingly help to perform an enormous scope of data examination utilizing various machines in the cluster. - -MapReduce Architecture -Components of MapReduce Architecture -1. MAPREDUCE ARCHITECTURE -HDFS and MapReduce architecture are the two significant parts of Hadoop that make it so efficient and powerful to utilize. 
MapReduce is a programming model utilized for proficient handling in equal over huge data collections in a conveyed way. The data is first to part and afterward consolidated to deliver the eventual outcome. - -The MapReduce task is predominantly isolated into 2 phases: - -Map Phase -Reduce Phase -The libraries for MapReduce are written in so many programming languages with different diverse various improvements. The motivation behind Map Reduce in Hadoop is to Map every one of the positions, and afterward, it will decrease it to comparable undertakings for giving less overhead over the cluster network and to diminish the preparing power. - -2. COMPONENTS OF MAPREDUCE ARCHITECTURE -Components of MapReduce Architecture are: -Client -Job -Hadoop MapReduce Master -Job Parts -Input Data -Output Data -Client: The MapReduce client is the person who carries the Job to the MapReduce for preparing. There can be numerous clients accessible that persistently send works for preparing to the Hadoop MapReduce Manager. -Job: The MapReduce Job is the real work that the customer needed to do which is included such countless more modest errands that the customer needs to execute or process. -Hadoop MapReduce Master: It separates the specific occupation into resulting position parts. -Job Parts: The sub-jobs or tasks that are acquired in the wake of isolating the primary work. The aftereffect of all the work parts joined to deliver the last yield. -Input Data: The data index that is taken care of to the MapReduce for handling. -Output Data: The end-product is acquired after preparation. -In MapReduce Architecture, we have a client. The client will present the job of a specific size to the Hadoop MapReduce Master. Presently, the MapReduce expert will isolate this job into additional identical job parts. These job parts are then made accessible for the MapReduce Task. - -Map Reduce programming according to the necessity of the utilization case that the specific organization is tackling. The engineer composes their rationale to satisfy the prerequisite that the business requires. The input which we are utilizing is then taken care of to the Map Task, and the Map will produce moderate key-esteem pair as its yield. The output of Map, for example, these key-esteem sets, are then taken care of to the Reducer, and the last yield is put away on the HDFS- Hadoop Distributed File System. - -There can be n number of MapReduce assignments made accessible for preparing the information according to the prerequisite. The calculation for MapReduce is made in an exceptionally upgraded way to such an extent that the time intricacy or space intricacy is least. - -How about we examine the MapReduce phases to improve comprehension of its architecture: - -MapReduce Architecture is fundamentally partitioned into two phases, for example, Map phase and Reduce phase. - -Map: As the name proposes, its principle use is to plan the input data in key-esteem sets. The contribution to the map might be a key-esteem pair where the key can be the id of some sort of address, and worth is the real value that it keeps. The Map () capacity will be executed in its memory vault on every one of these input key-esteem pairs and creates the moderate key-esteem pair, which fills in as a contribution for the Reducer or Reduce () work. -Reduce: The middle of the key-esteem combines that fill in as contribution for Reducer are send and sort and shuffled off the Reduce () work. 
Reducer total or gathering the data-dependent on its key-esteem pair according to the reducer calculation composed by the developer. -How Task Tracker and the Job tracker manage MapReduce Architecture: -Task Tracker: It can be considered as the real slaves that are dealing with the guidance given by the Job Tracker. This Task Tracker is conveyed on every one of the nodes accessible in the cluster that executes the MapReduce task as taught by Job Tracker. -Job Tracker: It is to deal with all the jobs and all the resources across the cluster and to plan each guide on the Task Tracker running on a similar information hub since there can be many data nodes accessible in the cluster. -There is additionally one significant segment of MapReduce Architecture known as Job History Server. The Job History Server is a daemon cycle that recoveries and stores authentic data about the application or task, similar to the logs which are created after or during the work execution are put away on Job History Server. - -Hadoop MapReduce architecture presently has become a famous solution for the present world necessities. The plan of Hadoop remembers different objectives. Hadoop MapReduce architecture diagram that encourages you to comprehend it better. - -Hadoop MapReduce framework architecture includes three significant layers. They are: -HDFS- Hadoop Distributed File System: NameNode and DataNode, Block in HDFS, and Replication Management. -Yarn: Scheduler, and Application Manager. -MapReduce: Map Task, and Reduce Task. -CONCLUSION -The MapReduce Architecture system works on the mind-boggling interaction of preparing monstrous information that is accessible in the Hadoop structure. There have been numerous critical changes in the MapReduce programming language. - -Hadoop is quite possibly the most well-known system to handle huge information, and extraordinary compared to other supporting squares of Hadoop is MapReduce. If you are looking for a profession as an information examiner in the data science field, at that point, you should know about this rising and well-known programming language. - -Jigsaw Academy’s Postgraduate Certificate Program In Cloud Computing brings Cloud aspirants closer to their dream jobs. The joint-certification course is 6 months long and is conducted online and will help you become a complete Cloud Professional. 
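To make the Map phase / Reduce phase contract walked through above more concrete, the following is a minimal, illustrative sketch using the Hadoop MapReduce Java API (the same API as the WordCount listing that appears further down in this patch). The class names TokenMapper and SumReducer are placeholders introduced here, not part of the original material: the mapper turns each input record into intermediate key-value pairs, and the reducer receives all values grouped under one key after the shuffle and sort.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

// Map phase: called once per input record, emits intermediate (word, 1) pairs.
class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(line.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, ONE);          // intermediate key-value pair
        }
    }
}

// Reduce phase: after shuffle and sort, the framework groups all values
// for the same key and hands them to a single reduce() call.
class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> counts, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable c : counts) {
            sum += c.get();
        }
        context.write(key, new IntWritable(sum));   // final (key, aggregate) pair
    }
}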
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt.xml.xls deleted file mode 100644 index 8c8c99abc15e4929bc8851db6d5d69970f488603..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What are the components of MapReduce-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/What are the components of MapReduce-relation.txt deleted file mode 100644 index ffe9e69078f112a2ae5aac6652660a4fe3ac62dd..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What are the components of MapReduce-relation.txt +++ /dev/null @@ -1,7 +0,0 @@ -component&MapReduce JobTracker&AGGREGATION -master&job&依赖 -main component&mapreduce job tracker job tracker&AGGREGATION -JobTracker&TaskTrackers&依赖 -JobTracker&job&依赖 -status&task&AGGREGATION -TaskTracker&task&依赖 diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt deleted file mode 100644 index 92be416b47c9d2759c4a919055a030be99e4a497..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt +++ /dev/null @@ -1,6 +0,0 @@ -What are the components of MapReduce -JobTracker and TaskTracker are the main components of MapReduce. -Job Tracker -Job Tracker is the master that creates and runs jobs. The JobTracker runs on the name node and allocates jobs to TaskTrackers. -TaskTracker -TaskTracker is a slave that runs on a data node. A TaskTracker runs the tasks and reports the status of each task to the JobTracker. 
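As a rough sketch of the client role described above, the driver below configures and submits a job; scheduling and execution of the resulting map and reduce tasks is then handled by the framework (the JobTracker and TaskTrackers in Hadoop 1.x, or the ResourceManager under YARN). It assumes the illustrative TokenMapper and SumReducer classes from the previous sketch, and the calls themselves mirror the WordCount driver included further down in this diff.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver: this is the "client" that submits a job to the cluster.
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");   // the job handed to the master
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(TokenMapper.class);           // logic run by map tasks
        job.setCombinerClass(SumReducer.class);          // optional local aggregation
        job.setReducerClass(SumReducer.class);           // logic run by reduce tasks
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // The framework splits the job into tasks, schedules them on the
        // trackers/nodes, and re-executes failed tasks; the client just waits.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}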
\ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt.xml.xls deleted file mode 100644 index 098b139dbcddcf69f1a0a9272e7a122d3f3f9eb1..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2-relation.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2-relation.txt deleted file mode 100644 index 93e85145312b8c8a0fea6b3702d314e2ffac723b..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2-relation.txt +++ /dev/null @@ -1,520 +0,0 @@ -each a set&key/value pair&AGGREGATION -mapreduce & hadoop ii mingshen sun chinese university&hong kong mssun@cse.cuhk.edu.hk mingshen sun ( cuhk ) mapreduce & hadoop outline • mapreduce recap • design pattern&AGGREGATION -set&value&AGGREGATION -set&intermediate key/value pairs • Reduce&AGGREGATION -number&partition&AGGREGATION -value&reducer 3 mingshen sun ( cuhk ) mapreduce & hadoop mapreduce recap • optional&依赖 -set&output value • MapReduce framework guarantee&AGGREGATION -v2 ) •&intermediate key/value pairs • Reduce&依赖 -v2 ) •&set&依赖 -simple hash&key and e.g.&AGGREGATION -v3 ) •&set&依赖 -v3 ) •&output value • MapReduce framework guarantee&依赖 -v2 ’ ) • mini-reducer&later 4 mingshen sun ( cuhk ) mapreduce & hadoop mapreduce recap 5 30 chapter 2&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&MapReduce&AGGREGATION -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 1 b 2 c 9&依赖 -we&multitude&依赖 -a 1 5 b 2 7 c 2 9 8 p p p p reducer reducer reducer x&7 z 9 figure 2.4&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&2 b&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 5 c&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&2 b&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&2 b&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 5 c&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 5 c&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 1 b 2 c 9&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&依赖 -multitude&algorithm&AGGREGATION -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&2 b&依赖 -we&multitude&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 1 b 2 c 9&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 5 c&依赖 -we&algorithm&依赖 -we&algorithm&依赖 -7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 1 b 2 c 9&依赖 -all term t 2 doc d&4&依赖 -[ c1&] ) 3&依赖 -[ c1&] ) 3&依赖 -[ 
c1&] ) 3&依赖 -[ c1&sum 0 4&依赖 -[ c1&sum 0 4&依赖 -[ c1&sum 0 4&依赖 -[ c1&] ) 3&依赖 -[ c1&sum 0 4&依赖 -] do 5&] do 5&依赖 -sum sum + c 6&basic word count algorithm&依赖 -sum sum + c 6&basic word count algorithm&依赖 -sum sum + c 6&basic word count algorithm&依赖 -sum sum + c 6&basic word count algorithm&依赖 -amount&intermediate datum&AGGREGATION -Section 2.4&output&依赖 -output&mapper&AGGREGATION -Section 2.4&mapper&依赖 -1 • in-mapper combine •&unique term&依赖 -1 • in-mapper combine •&key-value pair&依赖 -1 • in-mapper combine •&document 17 3.1&依赖 -term t 2 doc d&5&依赖 -Emit&entire document 6&依赖 -Emit&entire document 6&依赖 -Emit&entire document 6&依赖 -Emit&all term t&依赖 -Emit&all term t&依赖 -Emit&entire document 6&依赖 -2 h&7&依赖 -Emit&all term t&依赖 -Emit&all term t&依赖 -workings&detail&依赖 -workings&detail&依赖 -workings&algorithm critically&AGGREGATION -block&input count entire document mingshen sun ( cuhk ) mapreduce & hadoop word count&AGGREGATION -term t 2 doc d&6&依赖 -2 h&{ t } ) figure 3.3&依赖 -improved MapReduce word count algorithm&“ in-mapper combine ” design pattern&依赖 -Reducer&Figure 3.1&依赖 -Hadoop&example&依赖 -it&all&依赖 -Hadoop&guarantee&依赖 -execution framework&it&依赖 -execution framework&option&依赖 -In-Mapper Combiners • advantage&in-mapper combiner&AGGREGATION -local aggregation&place&依赖 -semantics&MapReduce&依赖 -semantics&contrast&依赖 -semantics&default combiner&AGGREGATION -Default combiner&map output&依赖 -• state&> potentially large memory overhead&依赖 -• state&mapper&依赖 -• algorithmic behavior&> potential order-dependent bug&依赖 -input keyvalue pair&> potential order-dependent bug&依赖 -input keyvalue pair&order&依赖 -• algorithmic behavior&order&依赖 -• with/without combiner&algorithm correctness •&依赖 -you&Java&依赖 -algorithm correctness •&0 , 1&依赖 -you&combiner class&依赖 -version 1 •&drawback&依赖 -•&reducer&依赖 -• i.e.&set combiner class&依赖 -• i.e.&reducer class 21 3.1&依赖 -r1 , r2&cnt 0 5&依赖 -r1 , r2&sum 0 4&依赖 -r1 , r2&] ) 3&依赖 -r1 , r2&cnt 0 5&依赖 -r1 , r2&] ) 3&依赖 -r1 , r2&sum 0 4&依赖 -mean&value&AGGREGATION -basic MapReduce algorithm&mean&依赖 -]&6&依赖 -MapReduce algorithm&algorithm&GENERALIZATION -]&basic MapReduce algorithm&依赖 -basic MapReduce algorithm&value&依赖 -value&mean&依赖 -value&mean&依赖 -value&value&依赖 -value&value&依赖 -mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 -mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 -mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 -we&concrete example&依赖 -mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 -mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 -Version 1 • Mean&means&AGGREGATION -it&problem&依赖 -It&a problem&依赖 -It&Word Count problem&依赖 -optimization&algorithm 23 48 chapter 3&依赖 -correctness&algorithm 23 48 chapter 3&AGGREGATION -r 7&Emit&依赖 -r 7&1 8&依赖 -r 7&1 8&依赖 -r 7&( string t&依赖 -r 7&Emit&依赖 -r 7&pair ( sum&依赖 -r 7&pair ( sum&依赖 -r 7&( string t&依赖 -sum sum +&incorrect first attempt&依赖 -sum sum +&combiner&依赖 -mismatch&MapReduce programming model&依赖 -MapReduce programming model&programming model&GENERALIZATION -mismatch&MapReduce programming model&依赖 -sum sum +&value&依赖 -sum sum +&mean&依赖 -mismatch&MapReduce programming model&依赖 -We&complex key and value&依赖 -optimization&correctness&依赖 -correctness&algorithm&AGGREGATION -restriction&programming model&依赖 -optimization&algorithm&依赖 -combiner&value&依赖 -combiner&integer&依赖 -combiner&list&依赖 -output value type&reducer&AGGREGATION -list&integer&AGGREGATION 
-list&pair&AGGREGATION -it&mingshen sun ( cuhk ) mapreduce & hadoop computing&依赖 -it&Version 3 •&依赖 -cnt cnt + c 8&Mean&依赖 -cnt cnt + c 8&Mean&依赖 -cnt cnt + c 8&Mean&依赖 -cnt cnt + c 8&Mean&依赖 -integer ravg ) figure 3.6&value&依赖 -cnt cnt + c 8&Mean&依赖 -integer ravg ) figure 3.6&mean&依赖 -integer ravg ) figure 3.6&mingshen sun ( cuhk ) mapreduce & hadoop computing&依赖 -•&combiner&依赖 -25 50 chapter 3&25 50 chapter 3&依赖 -MapReduce algorithm&value&依赖 -MapReduce algorithm&mean&依赖 -2&10&依赖 -reducer&Figure 3.6 and one&依赖 -reducer&correct sum and count&依赖 -combiner&aggregate partial sum&依赖 -they&many time&依赖 -reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer&依赖 -reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&input key-value type&依赖 -reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer&依赖 -input key-value type&reducer&AGGREGATION -reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&input key-value type&依赖 -reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer&依赖 -performance&computation&AGGREGATION -reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&依赖 -output key-value type&combiner&AGGREGATION -reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&依赖 -reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&依赖 -reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&input key-value type&依赖 -• m = n * n matrix ( n = number&unique word&AGGREGATION -26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 -26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 -26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 -26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 -[ j ] = number&time word w&AGGREGATION -26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 -26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 -distributional profile&word&AGGREGATION -we&whole matrix&实现 -billion&word&AGGREGATION -single machine&MapReduce&依赖 -single machine&whole matrix •&依赖 -each co-occur word pair and integer 1 •&use&依赖 -•&aggregate partial count&依赖 -default combiner&combiner&GENERALIZATION -each co-occur word pair and integer 1 •&reducer&依赖 -• mapper&partial count&依赖 -28 mingshen sun ( cuhk ) mapreduce & hadoop pairs&approach&依赖 -28 mingshen sun ( cuhk ) mapreduce & hadoop pairs&approach&依赖 -we&default combiner&依赖 -Our&approach& -term w 2 doc d&4&依赖 -2 neighbor and ( w&5&依赖 -[ c1&s 0 4&依赖 -[ c1&s 0 4&依赖 -[ c1&s 0 4&依赖 -[ c1&s 0 4&依赖 -Sum co-occurrence&figure 3.8 ( pair p&依赖 -Sum co-occurrence&count s&依赖 -Sum co-occurrence&( pair p&依赖 -h1 , h2 , h3&Hf new AssociativeArray 4&依赖 -h1 , h2 , h3&] ) 3&依赖 -h1 , h2 , h3&] ) 3&依赖 -h1 , h2 , h3&Hf new AssociativeArray 4&依赖 -] do 5&sum ( hf&依赖 -huge • both pair and stripe&in-mapper combine 34 mingshen sun ( cuhk ) mapreduce & hadoop pairs v.s.&依赖 -better use&combiners • Con&AGGREGATION -Memory size&associative array&AGGREGATION -) percentage ( second&approach figure 3.10&依赖 -) percentage ( second&stripe&依赖 -) percentage ( second&stripe&依赖 -time&” algorithm&AGGREGATION -) percentage ( second&" approach " pair&依赖 -) percentage ( second&" approach " pair&依赖 -) percentage ( second&apw corpus r2 = 0.992 r2 = 0.999&依赖 -) percentage ( second&stripe&依赖 -) percentage ( second&stripe&依赖 
-) percentage ( second&approach figure 3.10&依赖 -) percentage ( second&apw corpus r2 = 0.992 r2 = 0.999&依赖 -approach figure 3.10&apw corpus r2 = 0.992 r2 = 0.999&AGGREGATION -) percentage ( second&approach figure 3.10&依赖 -) percentage ( second&" approach " pair&依赖 -word cooccurrence matrix&APW corpus&AGGREGATION -di ↵ erent fraction&APW corpus&AGGREGATION -) percentage ( second&apw corpus r2 = 0.992 r2 = 0.999&依赖 -) percentage ( second&apw corpus r2 = 0.992 r2 = 0.999&依赖 -) percentage ( second&approach figure 3.10&依赖 -) percentage ( second&" approach " pair&依赖 -experiment&19 slave and ###&依赖 -experiment&Hadoop cluster&依赖 -experiment&19 slave and&依赖 -“&” • estimate relative frequency&依赖 -“&counts •&依赖 -we&MapReduce&依赖 -we&problem&依赖 -drawback&co-occurrence count&AGGREGATION -counts •&•&GENERALIZATION -other&itself&依赖 -36 Relative Frequencies Drawback&co-occurrence count&AGGREGATION -good ”&“ hello world ” estimate relative frequency&依赖 -good ”&MapReduce&依赖 -we&problem&依赖 -31&count& -use&partitioner&GENERALIZATION -sure same word&same reducer ( use and null&依赖 -sure same word&partitioner&依赖 -) • reducer&state&依赖 -order&key&AGGREGATION -• MapReduce&key&依赖 -• MapReduce&MapReduce&GENERALIZATION -• MapReduce&you&依赖 -• MapReduce&sort 39 mingshen sun ( cuhk ) mapreduce & hadoop order inversion&依赖 -• MapReduce&order&依赖 -• MapReduce&Idea •&依赖 -design pattern&order inversion&AGGREGATION -Idea •&•&GENERALIZATION -what&value&依赖 -individual count&same reducer • Preserve state&依赖 -sort order&intermediate key&AGGREGATION -computation&marginal • Control&AGGREGATION -individual count&reducer&依赖 -reading&reducer • buffer value&依赖 -• naive solution •&v ) •&依赖 -• naive solution •&> ( t&依赖 -reading&memory&依赖 -reading&id&依赖 -• naive solution •&• id&依赖 -• sensors record temperature&temperature v ) 41 mingshen sun ( cuhk ) mapreduce & hadoop secondary&依赖 -• sensors record temperature&temperature v ) 41 mingshen sun ( cuhk ) mapreduce & hadoop secondary&依赖 -reading&sensor id&AGGREGATION -•&processing • anything&依赖 -•&multiple key-value pair&依赖 -•&state&依赖 -• value-to-key conversion •&• ( id&依赖 -sorting&43 mingshen sun ( cuhk ) mapreduce & hadoop tools&依赖 -in-mapper combine • ram vs. disk&in-mapper combine • ram vs. disk&依赖 -in-mapper combine • ram vs. disk&• main idea&依赖 -in-mapper combine • ram vs. disk&• main idea&依赖 -reducers process key&• Control order&依赖 -key&local aggregation&依赖 -reducer process&which&依赖 -sorting&Synchronization • Cleverly-constructed data structure&依赖 -44 mingshen sun ( cuhk ) mapreduce & hadoop issues and tradeoffs • number&key-value pair&AGGREGATION -reducers process key&Scale • Works&依赖 -sorting&reducer ( 43 mingshen sun ( cuhk ) mapreduce & hadoop tools&依赖 -in-mapper combine • ram vs. disk&• main idea&依赖 -reducers process key&small dataset&依赖 -in-mapper combine • ram vs. disk&in-mapper combine • ram vs. disk&依赖 -reducers process key&Partitioner • Control&依赖 -in-mapper combine • ram vs. disk&in-mapper combine • ram vs. disk&依赖 -reducer process&a big difference • combiner&依赖 -reducers process key&Partitioner • Control&依赖 -in-mapper combine • ram vs. disk&• main idea&依赖 -key&local aggregation&依赖 -data together • Sort order&intermediate key&AGGREGATION -in-mapper combine • ram vs. disk&in-mapper combine • ram vs. 
disk&依赖 -network • Size&each key-value pair • de/serialization overhead&AGGREGATION -• Memory management issue&mangled input record&依赖 -• Memory management issue&mangled input record&依赖 -list&posting&AGGREGATION -• Each term&posting&依赖 -• Each term&list&依赖 -document id&id&GENERALIZATION -• Each post&document id&依赖 -INVERTED INDEXING&1 4 11 19 figure 4.1&依赖 -INVERTED INDEXING&1 4 11 19 figure 4.1&依赖 -Simple illustration&inverted index&AGGREGATION -term&list posting&依赖 -postings list&list&GENERALIZATION -front&postings list&AGGREGATION -auxiliary data structure&integer document id&依赖 -auxiliary data structure&mapping&依赖 -retrieval&postings list&依赖 -large •&MapReduce&依赖 -large •&MapReduce&依赖 -large •&MapReduce&依赖 -“&” )&AGGREGATION -54 mingshen sun ( cuhk ) mapreduce & hadoop baseline implementation •&goal&依赖 -Our&goal& -54 mingshen sun ( cuhk ) mapreduce & hadoop baseline implementation •&construct&依赖 -• actual document content ( value&What&依赖 -• Analyze&each document and extract useful term&依赖 -reducer&What&依赖 -reducer&term&依赖 -reducer&• Aggregates all observed posting&依赖 -2 H&posting p ) mingshen sun ( cuhk ) mapreduce & hadoop baseline implementation 57 4.4&依赖 -implementation 77 one fish&2 red d2 1 bird d3 1 one d3 1 red d3 1 reducer shuffle and sort&依赖 -implementation 77 one fish&2 one red bird doc 3 mapper mapper mapper fish&依赖 -implementation 77 one fish&inverted indexing&依赖 -implementation 77 one fish&aggregate value&依赖 -implementation 77 one fish&aggregate value&依赖 -implementation 77 one fish&2 red d2 1 bird d3 1 one d3 1 red d3 1 reducer shuffle and sort&依赖 -input&view •&依赖 -view •&•&GENERALIZATION -input&’s point&依赖 -posting&term •&AGGREGATION -implementation 77 one fish&2 red d2 1 bird d3 1 one d3 1 red d3 1 reducer shuffle and sort&依赖 -implementation 77 one fish&2 one red bird doc 3 mapper mapper mapper fish&依赖 -implementation 77 one fish&inverted indexing&依赖 -• Reducer&list (&依赖 -implementation 77 one fish&2 red d2 1 bird d3 1 one d3 1 red d3 1 reducer shuffle and sort&依赖 -implementation 77 one fish&aggregate value&依赖 -• Reducer&Reducer&GENERALIZATION -implementation 77 one fish&2 one red bird doc 3 mapper mapper mapper fish&依赖 -reducer&point& -list&term&AGGREGATION -implementation 77 one fish&2 red d2 1 bird d3 1 one d3 1 red d3 1 reducer shuffle and sort&依赖 -implementation 77 one fish&2 one red bird doc 3 mapper mapper mapper fish&依赖 -input&term&依赖 -implementation 77 one fish&inverted indexing&依赖 -implementation 77 one fish&aggregate value&依赖 -implementation 77 one fish&aggregate value&依赖 -implementation 77 one fish&2 red d2 1 bird d3 1 one d3 1 red d3 1 reducer shuffle and sort&依赖 -implementation 77 one fish&inverted indexing&依赖 -input&’s point&依赖 -• Reducer&need )&依赖 -implementation 77 one fish&2 one red bird doc 3 mapper mapper mapper fish&依赖 -implementation 77 one fish&aggregate value&依赖 -input&term&依赖 -implementation 77 one fish&2 one red bird doc 3 mapper mapper mapper fish&依赖 -1 red d2 1 d3 1 d3 1 simple illustration&baseline inverted indexing algorithm&AGGREGATION -implementation 77 one fish&2 red d2 1 bird d3 1 one d3 1 red d3 1 reducer shuffle and sort&依赖 -input&view •&依赖 -implementation 77 one fish&inverted indexing&依赖 -implementation 77 one fish&inverted indexing&依赖 -implementation 77 one fish&2 one red bird doc 3 mapper mapper mapper fish&依赖 -task&reducer&AGGREGATION -implementation 77 one fish&inverted indexing&依赖 -1 red d2 1 d3 1 d3 1 simple illustration&large , distributed group&依赖 -implementation 77 one fish&aggregate value&依赖 -implementation 77 one fish&2 one 
red bird doc 3 mapper mapper mapper fish&依赖 -implementation 77 one fish&aggregate value&依赖 -’s point&view •&AGGREGATION -implementation 77 one fish&2 red d2 1 bird d3 1 one d3 1 red d3 1 reducer shuffle and sort&依赖 -implementation 77 one fish&inverted indexing&依赖 -MapReduce framework&most heavy lifting&依赖 -two fish doc&aggregate value&实现 -two fish doc&aggregate value&实现 -two fish doc&aggregate value&实现 -Doc&hat& -two fish doc&aggregate value&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&aggregate value&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&依赖 -two fish doc&aggregate value&实现 -two fish doc&aggregate value&依赖 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -Positional&fish& -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&aggregate value&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&aggregate value&实现 -two fish doc&aggregate value&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&aggregate value&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&依赖 -two fish doc&aggregate value&实现 -two fish doc&aggregate value&实现 -two fish doc&aggregate value&依赖 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&aggregate value&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现 -5&2 H&依赖 -5&2 H&依赖 -5&2 H&依赖 -5&2 H&依赖 -5&2 H&依赖 -5&2 H&依赖 -hn1 , f1us , hn2 and f2us&P new List 4&依赖 -hn1 , f1us , hn2 and f2us&] ) 3&依赖 -] do 5&] do 5&依赖 -] do 5&sort ( p ) 7&依赖 -] do 5&] do 5&依赖 -] do 5&] do 5&依赖 -] do 5&sort ( p ) 7&依赖 -] do 5&sort ( p ) 7&依赖 -] do 5&sort ( p ) 7&依赖 -] do 5&baseline inverted indexing algorithm&AGGREGATION -• Reducer&sufficient memory&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop scalability issue • assumption&baseline implementation&AGGREGATION -• Reducer&posting&依赖 -reducer first buffer&posting line 5 )&依赖 -reducer first buffer&line 5 )&依赖 -Key idea&MapReduce framework&依赖 -tuple&same reducer&依赖 -63 mingshen sun ( cuhk ) mapreduce & hadoop revise implementation 64 4.5&62 mingshen sun ( cuhk ) mapreduce & hadoop revise implementation •&依赖 -63 mingshen sun ( cuhk ) mapreduce & hadoop revise implementation 64 4.5&disk directly • caution&依赖 -you&customized partitioner&依赖 -tuple&same reducer&依赖 -2 h&scalable inverted indexing algorithm&依赖 -7&scalable inverted indexing algorithm&AGGREGATION -2 h&MapReduce&依赖 -2 h&MapReduce&依赖 -2 h&7&依赖 -2 h&scalable inverted indexing algorithm&依赖 -you&MapReduce&依赖 -you&graph&依赖 -70 mingshen sun ( cuhk ) mapreduce & hadoop graph representations • two common representation&71 5.1&依赖 -linear algebra • easy algorithmic implementation • large memory space and esp&linear algebra • easy algorithmic implementation • large memory space and esp&依赖 -graph • shortest mean smallest hop count&minimum hop&依赖 -mingshen 
sun ( cuhk ) mapreduce & hadoop single-source shortest path&shortest path&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&shortest path&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&source node&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&shortest path&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&source node&依赖 -graph • shortest mean smallest hop count&graph • shortest mean smallest hop count&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&shortest path&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&shortest path&依赖 -72 mingshen sun ( cuhk ) mapreduce & hadoop dijkstra&’s algorithm&依赖 -graph • shortest mean smallest hop count&minimum hop&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&shortest path&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&source node&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&source node&依赖 -source node&node&GENERALIZATION -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&source node&依赖 -graph • shortest mean smallest hop count&graph • shortest mean smallest hop count&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&shortest path&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&shortest path&依赖 -72 mingshen sun ( cuhk ) mapreduce & hadoop dijkstra&’s algorithm&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&source node&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&source node&依赖 -mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&source node&依赖 -Dijkstra&algorithm& -graph • shortest mean smallest hop count&minimum hop&依赖 -graph • shortest mean smallest hop count&graph • shortest mean smallest hop count&依赖 -Figure 5.3&’s algorithm&AGGREGATION -GRAPH algorithm&) 8 13 10 1 n2 n4 8 9 10 1 n2 n4 8 9 10 1 n2 n4 0 5 7 5 2 3 9 7 4 6 n1 0 5 7 5 2 3 9 7 4 6 n1 0 5 7 5 2 3 9 7 4 6 n1 2 n3 n5 2 n3 n5 2 n3 n5&依赖 -GRAPH algorithm&indicate&依赖 -GRAPH algorithm&5 ∞ 2 7 1 n3 n5 5 7 2 7 1 n3 n5 (&依赖 -a ) – (&running&依赖 -a ) – (&algorithm&依赖 -a ) – (&running&依赖 -a ) – (&running&依赖 -a ) – (&algorithm&依赖 -a ) – (&algorithm&依赖 -running&algorithm&AGGREGATION -a ) – (&running&依赖 -a ) – (&algorithm&依赖 -78 mingshen sun ( cuhk ) mapreduce & hadoop bfs pseudo-code 79 mingshen sun ( cuhk ) mapreduce & hadoop stopping criterion •&adjacency list )&依赖 -many iteration¶llel bfs ( equal edge weight case )&依赖 -78 mingshen sun ( cuhk ) mapreduce & hadoop bfs pseudo-code 79 mingshen sun ( cuhk ) mapreduce & hadoop stopping criterion •&( n&依赖 -78 mingshen sun ( cuhk ) mapreduce & hadoop bfs pseudo-code 79 mingshen sun ( cuhk ) mapreduce & hadoop stopping criterion •&emit ( n&依赖 -driver program check&counter value&依赖 -driver program check&counter value&依赖 -driver program check&counter value&依赖 -many iteration¶llel bf&依赖 -amount&time&AGGREGATION -captures notion&page importance&AGGREGATION -random jump • n&graph 84 ↵ pr&依赖 -• One&thousand&AGGREGATION -total number&node&AGGREGATION -probability&random jump • n&AGGREGATION -thousand&feature&AGGREGATION -out-degree&t •&AGGREGATION -• c ( t )&t •&依赖 -random jump • n&node&依赖 -t •&•&GENERALIZATION diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt b/src/main/resources/sdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt deleted file mode 100644 index 
52ab29a9ab6190757983ca0211695d08fec6b49e..0000000000000000000000000000000000000000 --- a/src/main/resources/sdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt +++ /dev/null @@ -1,1407 +0,0 @@ -MapReduce & Hadoop II -Mingshen Sun -The Chinese University of Hong Kong -mssun@cse.cuhk.edu.hk -Mingshen Sun (CUHK) MapReduce & Hadoop -Outline -• MapReduce Recap -• Design patterns -• in-mapper combing -• pairs and stripes -• order inversion -• value-to-key conversion -2 -Mingshen Sun (CUHK) MapReduce & Hadoop -MapReduce Recap -• Input and output: each a set of key/value pairs. -• Tow functions implemented by users. -• Map (k1, v1) -> list(k2, v2) -• takes an input key/value pair -• produces a set of intermediate key/value pairs -• Reduce (k2, list(v2)) -> list(k3, v3) -• takes a set of values for an intermediate key -• produces a set of output value -• MapReduce framework guarantees that all values associated with -the same key are brought together in the reducer -3 -Mingshen Sun (CUHK) MapReduce & Hadoop -MapReduce Recap -• Optional functions: -• Partition (k’, number of partitions) -> -partition for k’ -• dividing up the intermediate key space and assigning intermediate -key-value pairs to reducers -• often a simple hash of the key, e.g., hash(k’) mod n -• Combine (k2, list(v2)) -> list(k2’, v2’) -• mini-reducers that run in memory after the map phase -• used as an optimization to reduce network traffic -• will be discuss later -4 -Mingshen Sun (CUHK) MapReduce & Hadoop -MapReduce Recap -5 -30 CHAPTER 2. MAPREDUCE BASICS -A α B β C γ D δ E ε F ζ -mapper mapper mapper mapper -a 1 b 2 c 3 c 6 a 5 c 2 b 7 c 8 -combiner combiner combiner combiner -pp pp pp pp -a 1 b 2 c 9 a 5 c 2 b 7 c 8 -partitioner partitioner partitioner partitioner -Shuffle and Sort: aggregate values by keys -a 1 5 b 2 7 c 2 9 8 -p p p p -reducer reducer reducer -X 5 Y 7 Z 9 -Figure 2.4: Complete view of MapReduce, illustrating combiners and partitioners in addition -Mingshen Sun (CUHK) MapReduce & Hadoop -Goals -• Key question: MapReduce provides an elegant -programming model, but how should we recast a multitude -of algorithms into the MapReduce model? 
-• Goal of this lecture: provide a guide to MapReduce -algorithm design: -• design patterns, which form the building blocks of may problems -6 -Mingshen Sun (CUHK) MapReduce & Hadoop -Challenges -• MapReduce execution framework handles most complicated -details -• e.g., copy intermediate key-value pairs from mappers to reducers -grouped by key during the shuffle and sort stage -• Programmers have little control over MapReduce execution: -• Where a mapper or reducer runs -• When a mapper or reduce begins or finishes -• Which input key-value pairs are processed by a specific mapper -• Which intermediate key-value pairs are processed by a specific -reducer -7 -Mingshen Sun (CUHK) MapReduce & Hadoop -Challenges -• Things that programmers can control: -• Construct complex data structures as keys and -values to store and communicate partial results -• Execute user-specified initialization/termination code in a map or -reduce task -• Preserve state in both mappers and reducers across multiple input -or intermediate keys -• Control sort order of intermediate keys, and hence the order of how -a reducer processes keys -• Control partitioning of key space, and hence the set of keys -encountered by a reducer -8 -Mingshen Sun (CUHK) MapReduce & Hadoop -Challenges -• What we really want… -• No inherent bottlenecks as algorithms are applied to -increasingly large datasets -• linear scalability: an algorithm running on twice the amount of data -should take only twice as long -• an algorithm running on twice the number of nodes should only take -half as long -9 -Mingshen Sun (CUHK) MapReduce & Hadoop -Design Patterns -• Combiners and in-mapper combining -• aggregate map outputs to reduce data traffic being shuffled from -mappers to reducers -• Paris and stripes -• keep track of joint events -• Order inversion -• sort and control the sequence of computation -• Value-to-key conversion -• allow secondary sorting -10 -Mingshen Sun (CUHK) MapReduce & Hadoop -Local Aggregation -• In Hadoop, intermediate results (i.e., map outputs) are -written to local disk before being sent over the network -• network and disk latencies are expensive -• Local aggregation of intermediate results reduces the -number of key-value pairs that need to be shuffled from the -mappers to the reducers -• Default combiner: -• provided by the MapReduce framework -• aggregate map outputs with the same key -• acts like a mini-reducer -11 -Mingshen Sun (CUHK) MapReduce & Hadoop -Word Count: Baseline -• What is the number of records being shuffled? -• without combiners? -• with combiners? -12 -42 CHAPTER 3. MAPREDUCE ALGORITHM DESIGN -1: class Mapper -2: method Map(docid a, doc d) -3: for all term t 2 doc d do -4: Emit(term t, count 1) -1: class Reducer -2: method Reduce(term t, counts [c1, c2, . . .]) -3: sum 0 -4: for all count c 2 counts [c1, c2, . . .] do -5: sum sum + c -6: Emit(term t, count sum) -Figure 3.1: Pseudo-code for the basic word count algorithm in MapReduce (repeated from -Figure 2.3). -The first technique for local aggregation is the combiner, already discussed Section 2.4. Combiners provide a general mechanism within the MapReduce framework -to reduce the amount of intermediate data generated by the mappers—recall that they -can be understood as “mini-reducers” that process the output of mappers. 
In this -Mingshen Sun (CUHK) MapReduce & Hadoop -Implementation in Hadoop -public class WordCount { -} -13 -public static class TokenizerMapper -extends Mapper{ -private final static IntWritable one = new IntWritable(1); -private Text word = new Text(); -public void map(Object key, Text value, Context context -) throws IOException, InterruptedException { -StringTokenizer itr = new StringTokenizer(value.toString()); -while (itr.hasMoreTokens()) { -word.set(itr.nextToken()); -context.write(word, one); -} -} -} -Mingshen Sun (CUHK) MapReduce & Hadoop -Implementation in Hadoop -public class WordCount { -} -14 -public static class IntSumReducer -extends Reducer { -private IntWritable result = new IntWritable(); -public void reduce(Text key, Iterable values, -Context context -) throws IOException, InterruptedException { -int sum = 0; -for (IntWritable val : values) { -sum += val.get(); -} -result.set(sum); -context.write(key, result); -} -} -Mingshen Sun (CUHK) MapReduce & Hadoop -Implementation in Hadoop -public class WordCount { -} -15 -public static void main(String[] args) throws Exception { -Configuration conf = new Configuration(); -Job job = Job.getInstance(conf, "word count"); -job.setJarByClass(WordCount.class); -job.setMapperClass(TokenizerMapper.class); -job.setCombinerClass(IntSumReducer.class); -job.setReducerClass(IntSumReducer.class); -job.setOutputKeyClass(Text.class); -job.setOutputValueClass(IntWritable.class); -FileInputFormat.addInputPath(job, new Path(args[0])); -FileOutputFormat.setOutputPath(job, new Path(args[1])); -System.exit(job.waitForCompletion(true) ? 0 : 1); -} -Mingshen Sun (CUHK) MapReduce & Hadoop -Usage -• Environment -• Compile & Package -• Run -16 -export JAVA_HOME=/usr/java/default -export PATH=$JAVA_HOME/bin:$PATH -export HADOOP_CLASSPATH=$JAVA_HOME/lib/tools.jar -$ bin/hadoop com.sun.tools.javac.Main WordCount.java -$ jar cf wc.jar WordCount*.class -$ bin/hadoop jar wc.jar WordCount /user/joe/wordcount/ -input /user/joe/wordcount/output -Mingshen Sun (CUHK) MapReduce & Hadoop -Word Count: Version 1 -• in-mapper combining -• emits a key-value pair for each unique term per document -17 -3.1. LOCAL AGGREGATION 1: class Mapper -2: method Map(docid a, doc d) -3: H new AssociativeArray -4: for all term t 2 doc d do -5: H{t} H{t} + 1 . Tally counts for entire document -6: for all term t 2 H do -7: Emit(term t, count H{t}) -Figure 3.2: Pseudo-code for the improved MapReduce word count algorithm that associative array to aggregate term counts on a per-document basis. Reducer is the same Figure 3.1. -This basic idea can be taken one step further, as illustrated in the variant word count algorithm in Figure 3.3 (once again, only the mapper is modified). workings of this algorithm critically depends on the details of how map and tasks in Hadoop are executed, discussed in Section 2.6. Recall, a (Java) mapper is created for each map task, which is responsible for processing a block of input counts for entire document -Mingshen Sun (CUHK) MapReduce & Hadoop -Word Count: Version 2 -• in-mapper combining -• recall a map object is created for each map task -• aggregate all data appearing in the input block processed by the -map task -18 -44 CHAPTER 3. MAPREDUCE ALGORITHM DESIGN -1: class Mapper -2: method Initialize -3: H new AssociativeArray -4: method Map(docid a, doc d) -5: for all term t 2 doc d do -6: H{t} H{t} + 1 . 
-Figure 3.3: Pseudo-code for the improved MapReduce word count algorithm that demonstrates the “in-mapper combining” design pattern. Reducer is the same as in Figure 3.1.
-(Initialize corresponds to setup() in Java; Close corresponds to cleanup() in Java.)
-For example, Hadoop makes no guarantees on how many times the combiner is applied, or that it is even applied at all. The combiner is provided as a semantics-preserving optimization to the execution framework, which has the option of using it, perhaps multiple times, or not at all (or even in the reduce phase). In some cases (although not in this particular example), such indeterminism is unacceptable.
-Combiners vs. In-Mapper Combiners
-• Advantages of in-mapper combiners:
-• Provide control over where and how local aggregation takes place. In contrast, the semantics of default combiners are underspecified in MapReduce.
-• In-mapper combiners are applied inside the map code. Default combiners are applied to the map outputs (after being emitted by the map task).
-• Disadvantages:
-• State is preserved within mappers -> potentially large memory overhead.
-• Algorithmic behavior may depend on the order in which input key-value pairs are encountered -> potential order-dependent bugs.
-Combiner Design
-• Combiner and reducer must share the same signature
-• combiner is treated as a mini-reducer
-• combiner input and output key-value types must match reducer input key-value types
-• Remember: combiners are optional optimizations
-• with/without a combiner should not affect algorithm correctness
-• may be run 0, 1, or multiple times, as determined by the MapReduce execution framework
-• In Java, you can specify the combiner class as:
-• public void setCombinerClass(Class<? extends Reducer> cls)
-• exactly the Reducer type
-Computing the Mean: Version 1
-• Any drawback?
-• Can we use the reducer as a combiner?
-• i.e., set the combiner class to be the reducer class
-1: class Mapper
-2:   method Map(string t, integer r)
-3:     Emit(string t, integer r)
-1: class Reducer
-2:   method Reduce(string t, integers [r1, r2, ...])
-3:     sum ← 0
-4:     cnt ← 0
-5:     for all integer r ∈ integers [r1, r2, ...] do
-6:       sum ← sum + r
-7:       cnt ← cnt + 1
-8:     ravg ← sum / cnt
-9:     Emit(string t, integer ravg)
-Figure 3.4: Pseudo-code for the basic MapReduce algorithm that computes the mean of values associated with the same key.
-As a concrete example, we know that: Mean(1, 2, 3, 4, 5) ≠ Mean(Mean(1, 2), Mean(3, 4, 5))
-Computing the Mean: Version 1
-• Mean of the means is not the original mean.
-• e.g., mean(1, 2, 3, 4, 5) != mean(mean(1, 2), mean(3, 4, 5))
-• It’s not a problem for the Word Count example, but it is a problem here.
-Computing the Mean: Version 2
-• Does it work? Why?
-• recall that combiners must have the same input and output key-value type
-• Why?
-• combiners are optimizations that cannot change the correctness of the algorithm
-1: class Mapper
-2:   method Map(string t, integer r)
-3:     Emit(string t, integer r)
-1: class Combiner
-2:   method Combine(string t, integers [r1, r2, ...])
-3:     sum ← 0
-4:     cnt ← 0
-5:     for all integer r ∈ integers [r1, r2, ...] do
-6:       sum ← sum + r
-7:       cnt ← cnt + 1
-8:     Emit(string t, pair (sum, cnt))            ▷ Separate sum and count
-1: class Reducer
-2:   method Reduce(string t, pairs [(s1, c1), (s2, c2), ...])
-3:     sum ← 0
-4:     cnt ← 0
-5:     for all pair (s, c) ∈ pairs [(s1, c1), (s2, c2), ...] do
-6:       sum ← sum + s
-7:       cnt ← cnt + c
-8:     ravg ← sum / cnt
-9:     Emit(string t, integer ravg)
-Figure 3.5: Pseudo-code for an incorrect first attempt at introducing combiners to compute the mean of values associated with each key. The mismatch between combiner input and output key-value types violates the MapReduce programming model.
-Unfortunately, this algorithm will not work. Recall that combiners must have the same input and output key-value type, which also must match the mapper output type and the reducer input type. This is clearly not the case here: the reducer expects to receive a list of integers as values, but the combiner emits a list of pairs. Remember that combiners are optimizations that cannot change the correctness of the algorithm.
-Computing the Mean: Version 3
-• Does it work? Why?
-1: class Mapper
-2:   method Map(string t, integer r)
-3:     Emit(string t, pair (r, 1))
-1: class Combiner
-2:   method Combine(string t, pairs [(s1, c1), (s2, c2), ...])
-3:     sum ← 0
-4:     cnt ← 0
-5:     for all pair (s, c) ∈ pairs [(s1, c1), (s2, c2), ...] do
-6:       sum ← sum + s
-7:       cnt ← cnt + c
-8:     Emit(string t, pair (sum, cnt))
-1: class Reducer
-2:   method Reduce(string t, pairs [(s1, c1), (s2, c2), ...])
-3:     sum ← 0
-4:     cnt ← 0
-5:     for all pair (s, c) ∈ pairs [(s1, c1), (s2, c2), ...] do
-6:       sum ← sum + s
-7:       cnt ← cnt + c
-8:     ravg ← sum / cnt
-9:     Emit(string t, integer ravg)
-Figure 3.6: Pseudo-code for a MapReduce algorithm that computes the mean of values associated with each key.
-Computing the Mean: Version 4
-• Does it work?
-• Do we need a combiner?
-1: class Mapper
-2:   method Initialize
-3:     S ← new AssociativeArray
-4:     C ← new AssociativeArray
-5:   method Map(string t, integer r)
-6:     S{t} ← S{t} + r
-7:     C{t} ← C{t} + 1
-8:   method Close
-9:     for all term t ∈ S do
-10:      Emit(term t, pair (S{t}, C{t}))
-Figure 3.7: Pseudo-code for a MapReduce algorithm that computes the mean of values associated with each key, illustrating the in-mapper combining design pattern. Only the mapper is shown here; the reducer is the same as in Figure 3.6.
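As a rough illustration of how the combiner of Version 3 could look in the Hadoop API used earlier, here is a minimal sketch that aggregates (sum, count) pairs without computing the mean. Encoding the pair as a comma-separated Text value instead of a custom Writable is an assumption made only to keep the sketch short, and the class name is illustrative.

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Combiner for the mean job: input and output types are identical (Text, Text),
// so the combiner contract holds no matter how many times it runs.
class MeanCombiner extends Reducer<Text, Text, Text, Text> {
  @Override
  public void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException {
    long sum = 0, cnt = 0;
    for (Text v : values) {                          // each value is "partialSum,partialCount"
      String[] parts = v.toString().split(",");
      sum += Long.parseLong(parts[0]);
      cnt += Long.parseLong(parts[1]);
    }
    context.write(key, new Text(sum + "," + cnt));   // still a (sum, count) pair, not a mean
  }
}

The mapper would emit the value "r,1" for each reading r, and the reducer would do the same accumulation but finish with context.write(key, new DoubleWritable((double) sum / cnt)), which is the only place the division happens.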
-The reducer would still arrive at the correct sum and count, and hence the mean would be correct. Now add in the combiners: the algorithm would remain correct, no matter how many times they run, since the combiners merely aggregate partial sums and counts to pass along to the reducers. Note that the output key-value type of the combiner must be the same as the input key-value type of the reducer.
-Pairs and Stripes
-• To illustrate how constructing complex keys and values improves the performance of computation.
-A New Running Example
-• Problem: building a word co-occurrence matrix over a text collection
-• M = n * n matrix (n = number of unique words)
-• m[i][j] = number of times word w[i] co-occurs with word w[j] within a specific context (e.g., same sentence, same paragraph, same document)
-• it is easy to show that m[i][j] == m[j][i]
-• Why is this problem interesting?
-• distributional profiles of words
-• information retrieval
-• statistical natural language processing
-Challenge
-• Space requirement: O(n^2).
-• too big if we simply store the whole matrix with billions of words in memory
-• a single machine typically cannot keep the whole matrix
-• How to use MapReduce to implement this large counting problem?
-• Our approach:
-• mappers generate partial counts
-• reducers aggregate partial counts
-Pairs
-• Each mapper:
-• Emits intermediate key-value pairs with each co-occurring word pair and integer 1
-• Each reducer:
-• Sums up all values associated with the same co-occurring word pair
-• MapReduce execution framework guarantees that all values associated with the same key are brought together in the reducer
-Pairs
-• Can we use the default combiner here?
-1: class Mapper
-2:   method Map(docid a, doc d)
-3:     for all term w ∈ doc d do
-4:       for all term u ∈ Neighbors(w) do
-5:         Emit(pair (w, u), count 1)            ▷ Emit count for each co-occurrence
-1: class Reducer
-2:   method Reduce(pair p, counts [c1, c2, ...])
-3:     s ← 0
-4:     for all count c ∈ counts [c1, c2, ...] do
-5:       s ← s + c                               ▷ Sum co-occurrence counts
-6:     Emit(pair p, count s)
-Figure 3.8: Pseudo-code for the “pairs” approach for computing word co-occurrence matrices from large corpora.
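Below is a minimal sketch of the pairs mapper from Figure 3.8 in the Hadoop API used earlier. It assumes co-occurrence is defined as "appearing on the same input line" and packs the word pair (w, u) into a single tab-separated Text key instead of a custom WritableComparable; names are illustrative.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

class PairsMapper extends Mapper<Object, Text, Text, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);
  private final Text pair = new Text();

  @Override
  public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
    List<String> terms = new ArrayList<>();
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) terms.add(itr.nextToken());
    for (String w : terms) {
      for (String u : terms) {              // Neighbors(w): every other term on the same line
        if (!w.equals(u)) {
          pair.set(w + "\t" + u);           // composite key (w, u) packed into one Text
          context.write(pair, ONE);
        }
      }
    }
  }
}

With this encoding, the IntSumReducer from the WordCount example can be reused as both combiner and reducer, since summing partial counts is associative and commutative.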
-Stripes
-• Each mapper:
-• For each particular word, stores co-occurrence information in an associative array
-• Emits intermediate key-value pairs with words as keys and corresponding associative arrays as values
-• Each reducer:
-• Sums all the counts in the associative arrays
-• MapReduce execution framework guarantees that all associative arrays with the same key are brought together in the reducer
-Stripes
-• Example:
-• Each mapper emits a -> {b: count(b), c: count(c), d: count(d), …}
-• e.g., (a, b) -> 1, (a, c) -> 2, (a, d) -> 5, (a, e) -> 3, (a, f) -> 2 becomes a -> {b: 1, c: 2, d: 5, e: 3, f: 2}
-• Reducers perform element-wise sum of associative arrays:
-• a -> {b: 1, d: 5, e: 3} + a -> {b: 1, c: 2, d: 2, f: 2} = a -> {b: 2, c: 2, d: 7, e: 3, f: 2}
-Stripes
-• pseudo-code of the stripes approach
-1: class Mapper
-2:   method Map(docid a, doc d)
-3:     for all term w ∈ doc d do
-4:       H ← new AssociativeArray
-5:       for all term u ∈ Neighbors(w) do
-6:         H{u} ← H{u} + 1                       ▷ Tally words co-occurring with w
-7:       Emit(Term w, Stripe H)
-1: class Reducer
-2:   method Reduce(term w, stripes [H1, H2, H3, ...])
-3:     Hf ← new AssociativeArray
-4:     for all stripe H ∈ stripes [H1, H2, H3, ...] do
-5:       Sum(Hf, H)                              ▷ Element-wise sum
-6:     Emit(term w, stripe Hf)
-Figure 3.9: Pseudo-code for the “stripes” approach for computing word co-occurrence matrices from large corpora.
-Pairs vs. Stripes
-• Pairs:
-• Pro: Easy to understand and implement
-• Con: Generates many key-value pairs
-• Stripes:
-• Pro: Generates fewer key-value pairs
-• Pro: Makes better use of combiners
-• Con: Memory size of associative arrays in mappers could be huge
-• Both pairs and stripes can apply in-mapper combining
-Pairs vs. Stripes
-• stripes much faster than pairs
-• linearity is maintained
-Figure 3.10: Running time of the “pairs” and “stripes” algorithms for computing word co-occurrence matrices on different fractions of the APW corpus. These experiments were performed on a Hadoop cluster with 19 slaves, each with two single-core processors and two disks.
-Relative Frequencies
-• Drawback of co-occurrence counts
-• absolute counts don’t consider that some words appear more frequently than others
-• e.g., “is” occurs very often by itself
-• this doesn’t imply that “is good” occurs more frequently than “Hello World”
-• Estimate relative frequencies instead of counts
-• How do we apply MapReduce to this problem?
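A minimal sketch of the element-wise stripe sum from Figure 3.9, assuming stripes are represented with Hadoop's MapWritable and counts with IntWritable; the class name is illustrative. Because its input and output types match, the same class can be registered as both combiner and reducer.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;

class StripesSumReducer extends Reducer<Text, MapWritable, Text, MapWritable> {
  @Override
  public void reduce(Text word, Iterable<MapWritable> stripes, Context context)
      throws IOException, InterruptedException {
    Map<String, Integer> sums = new HashMap<>();
    for (MapWritable stripe : stripes) {
      for (Map.Entry<Writable, Writable> e : stripe.entrySet()) {
        // element-wise sum: add this stripe's count for the co-occurring word
        sums.merge(e.getKey().toString(), ((IntWritable) e.getValue()).get(), Integer::sum);
      }
    }
    MapWritable merged = new MapWritable();
    for (Map.Entry<String, Integer> e : sums.entrySet()) {
      merged.put(new Text(e.getKey()), new IntWritable(e.getValue()));
    }
    context.write(word, merged);
  }
}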
-f(B|A) = count(A, B) / count(A) = count(A, B) / Σ_B' count(A, B')   (the denominator is the marginal count)
-Relative Frequencies
-• Computing relative frequencies with the stripes approach is straightforward
-• Sum all the counts in the associative array for each word
-• Why is it possible in MapReduce?
-• Drawback: assumes that each associative array fits into memory
-• How to compute relative frequencies with the pairs approach?
-Relative Frequencies with Pairs
-• Mapper emits (a, *) for every word being observed
-• Mapper makes sure the same word goes to the same reducer (use a partitioner)
-• Mapper makes sure (a, *) comes first, before the individual counts (how?)
-• Reducer holds state to remember the count of (a, *), until all pairs with the word “a” have been computed
-• e.g., (a, *) -> 32 (reducer holds this value in memory), (a, b1) -> 3, (a, b2) -> 12, (a, b3) -> 7, (a, b4) -> 1, …
-• becomes (a, b1) -> 3/32, (a, b2) -> 12/32, (a, b3) -> 7/32, (a, b4) -> 1/32, …
-Order Inversion
-• Why order inversion?
-• Computing relative frequencies requires marginal counts
-• But the marginal cannot be computed until you see all counts
-• Buffering is a bad idea!
-• Trick: get the marginal counts to arrive at the reducer before the joint counts
-• MapReduce allows you to define the order of keys being processed by the reducer
-• shuffle and sort
-Order Inversion: Idea
-• How to use the design pattern of order inversion to compute relative frequencies via the pairs approach?
-• Emit a special key-value pair for each co-occurring word for the computation of the marginal
-• Control the sort order of the intermediate key so that the marginal count comes before individual counts
-• Define a custom partitioner to ensure all pairs with the same left word are shuffled to the same reducer
-• Preserve state in the reducer to remember the marginal count for each word
-Secondary Sorting
-• MapReduce sorts input to reducers by key
-• values may be arbitrarily ordered
-• What if we want to sort values as well?
-• Scenario:
-• sensors record temperature over time
-• each sensor emits (id, time t, temperature v)
-Secondary Sorting
-• Naive solution
-• each sensor emits id -> (t, v)
-• all readings of sensor id will be aggregated into a reducer
-• buffer values in memory for all id, then sort
-• Why is this a bad idea?
-Secondary Sorting
-• Value-to-key conversion
-• each mapper emits (id, t) -> v
-• let the execution framework do the sorting
-• preserve state across multiple key-value pairs to handle processing
-• anything else?
-• Main idea: sorting is offloaded from the reducer (in the naive approach) to the MapReduce framework
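To make value-to-key conversion concrete, here is a minimal sketch of a composite key for the sensor example, written against Hadoop's WritableComparable; the class and field names are illustrative, not from the lecture.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

// Composite key (sensor id, timestamp): sorting by this key makes readings
// arrive at the reducer ordered by time within each sensor.
class SensorTimeKey implements WritableComparable<SensorTimeKey> {
  private String sensorId;
  private long timestamp;

  public SensorTimeKey() { }
  public SensorTimeKey(String sensorId, long timestamp) {
    this.sensorId = sensorId;
    this.timestamp = timestamp;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    out.writeUTF(sensorId);
    out.writeLong(timestamp);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    sensorId = in.readUTF();
    timestamp = in.readLong();
  }

  @Override
  public int compareTo(SensorTimeKey other) {
    int cmp = sensorId.compareTo(other.sensorId);                        // group by sensor first
    return cmp != 0 ? cmp : Long.compare(timestamp, other.timestamp);    // then by time
  }

  @Override
  public int hashCode() { return sensorId.hashCode(); }                  // partition on sensor id only
}

Besides the key itself, the job would also register a partitioner and a grouping comparator that look only at the sensor id (e.g., via job.setPartitionerClass and job.setGroupingComparatorClass), so that all readings of one sensor reach a single reduce call already ordered by time.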
-Tools for Synchronization
-• Cleverly-constructed data structures
-• Bring data together
-• Sort order of intermediate keys
-• Control the order in which reducers process keys
-• Partitioner
-• Control which reducer processes which keys
-• Preserving state in mappers and reducers
-• Capture dependencies across multiple keys and values
-Issues and Tradeoffs
-• Number of key-value pairs
-• Object creation overhead
-• Time for sorting and shuffling pairs across the network
-• Size of each key-value pair
-• De/serialization overhead
-• Local aggregation
-• Opportunities to perform local aggregation vary
-• Combiners make a big difference
-• Combiners vs. in-mapper combining
-• RAM vs. disk vs. network
-Debugging at Scale
-• Works on small datasets, won’t scale... why?
-• Memory management issues (buffering and object creation)
-• Too much intermediate data
-• Mangled input records
-• Real-world data is messy!
-• Word count: how many unique words in Wikipedia?
-• There’s no such thing as “consistent data”
-• Watch out for corner cases
-• Isolate unexpected behavior, bring it local
-Summary
-• Design patterns
-• in-mapper combining
-• pairs and stripes
-• order inversion
-• value-to-key conversion
-MapReduce Applications
-• Text retrieval
-• inverted indexing
-• Data mining
-• TF-IDF
-• Graph algorithms
-• parallel breadth-first search
-• parallel Dijkstra’s algorithm
-• PageRank
-Web Search Problem
-• Web search is to retrieve relevant web objects
-• e.g., web pages, PDFs, PPT slides
-• Web search problem
-• crawling: gathering web content
-• indexing: constructing the search indexing structure
-• retrieval: ranking documents given a query
-• Challenge
-• the web is huge
-• billions of web objects, terabytes of information
-• Performance goals
-• query latency needs to be small
-• scalable to a large number of documents
-Inverted Indexes
-• Inverted Index
-• A data structure that, given a term, provides access to the list of documents that contain the term
-• Used by most full-text search engines today
-• By documents, we mean web objects
-• Retrieval engine uses the inverted index to score documents that contain the query terms based on some ranking model
-• e.g., based on term matches, term proximity, term attributes, etc.
-Inverted Indexes
-• Simple illustration of an inverted index.
-• Each term is associated with a list of postings.
-• Each posting is comprised of a document id and a payload, denoted by p in this case.
-• An inverted index provides quick access to document ids that contain a term.
-Figure 4.1: Simple illustration of an inverted index. Each term is associated with a list of postings. Each posting is comprised of a document id and a payload, denoted by p in this case. An inverted index provides quick access to document ids that contain a term.
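As a toy illustration of the structure in Figure 4.1, the sketch below keeps an inverted index in memory; names are illustrative and it assumes Java 16+ records. A real index would be compressed and stored on disk.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Each term maps to a postings list; each posting holds a document id plus a payload
// (here, the term frequency plays the role of the payload p).
class InMemoryInvertedIndex {
  record Posting(int docId, int payload) { }

  private final Map<String, List<Posting>> postings = new HashMap<>();

  void add(String term, int docId, int payload) {
    postings.computeIfAbsent(term, t -> new ArrayList<>()).add(new Posting(docId, payload));
  }

  List<Posting> lookup(String term) {
    return postings.getOrDefault(term, List.of());   // quick access to the documents containing the term
  }
}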
-Either way, an auxiliary data structure is necessary to maintain the mapping from integer document ids to some other more meaningful handle, such as a URL. Given a query, retrieval involves fetching postings lists associated with query terms and traversing the postings to compute the result set.
-Inverted Indexes
-• Given a query, retrieval involves fetching postings lists associated with query terms and traversing the postings to compute the result set.
-• Simple Boolean retrieval:
-• Apply union (OR) or intersection (AND) of postings lists
-• General retrieval:
-• Document scores are ranked
-• Top k documents are returned
-Inverted Indexes
-(Slide illustration: an inverted index built over Doc 1 “one fish, two fish”, Doc 2 “red fish, blue fish”, Doc 3 “cat in the hat”, Doc 4 “green eggs and ham”.)
-Inverted Indexes: Construction
-• How to construct an inverted index?
-• Naive approach:
-• For each document, extract all useful terms, exclude all stopwords (e.g., “the”, “a”, “of”), and remove affixes (e.g., “dogs” to “dog”)
-• For each term, add the posting (document, payload) to an existing list, or create a posting list if the term is new
-• Clearly, the naive approach is not scalable if the document collection is huge and each document is large
-• Can we use MapReduce?
-Baseline Implementation
-• Our goal: construct an inverted index given a document collection
-• Main idea:
-• Input to each mapper:
-• Document IDs (keys)
-• Actual document content (values)
-• What each mapper does:
-• Analyze each document and extract useful terms
-• Compute term frequencies (per document)
-• Emit (term, posting)
-• What each reducer does:
-• Aggregates all observed postings for each term
-• Constructs the posting list
-Baseline Implementation
-1: class Mapper
-2:   method Map(docid n, doc d)
-3:     H ← new AssociativeArray
-4:     for all term t ∈ doc d do
-5:       H{t} ← H{t} + 1
-6:     for all term t ∈ H do
-7:       Emit(tuple ⟨t, n⟩, tf H{t})
-1: class Reducer
-2:   method Initialize
-3:     tprev ← ∅
-4:     P ← new PostingsList
-5:   method Reduce(tuple ⟨t, n⟩, tf [f])
-6:     if t ≠ tprev ∧ tprev ≠ ∅ then
-7:       Emit(term t, postings P)
-8:       P.Reset()
-9:     P.Add(⟨n, f⟩)
-10:    tprev ← t
-11:  method Close
-12:    Emit(term t, postings P)
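For comparison with the pseudo-code, here is a minimal Hadoop sketch of the baseline mapper. It assumes KeyValueTextInputFormat so the document id arrives as the input key, and it encodes each posting as a "docId:tf" Text value instead of a custom Writable; names are illustrative.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

class InvertedIndexMapper extends Mapper<Text, Text, Text, Text> {
  @Override
  public void map(Text docId, Text doc, Context context) throws IOException, InterruptedException {
    Map<String, Integer> tf = new HashMap<>();
    StringTokenizer itr = new StringTokenizer(doc.toString());
    while (itr.hasMoreTokens()) {
      tf.merge(itr.nextToken(), 1, Integer::sum);          // per-document term frequencies
    }
    for (Map.Entry<String, Integer> e : tf.entrySet()) {
      // one posting per unique term: (term, "docId:tf")
      context.write(new Text(e.getKey()), new Text(docId.toString() + ":" + e.getValue()));
    }
  }
}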
-Baseline Implementation
-(Figure: simple illustration of the baseline inverted indexing algorithm in MapReduce on the “one fish, two fish” / “red fish, blue fish” / “one red bird” example documents, showing mappers, the shuffle and sort phase, and reducers producing per-term postings lists.)
-Baseline Implementation
-• In the shuffle and sort phase, the MapReduce framework performs a large, distributed group-by of the postings of each term
-• From the reducer’s point of view
-• Each input to the reducer is the resulting posting list of a term
-• The reducer may sort the list (if needed), and writes the final output to disk
-• The task of each reducer is greatly simplified! The MapReduce framework has done most of the heavy lifting.
-Positional Indexes
-(Slide illustration: the same example with each posting carrying the document id and payload through map, shuffle and sort, and reduce.)
-Scalability Issue
-• Scalability problem in the baseline implementation
-1: class Mapper
-2:   procedure Map(docid n, doc d)
-3:     H ← new AssociativeArray
-4:     for all term t ∈ doc d do
-5:       H{t} ← H{t} + 1
-6:     for all term t ∈ H do
-7:       Emit(term t, posting ⟨n, H{t}⟩)
-1: class Reducer
-2:   procedure Reduce(term t, postings [⟨n1, f1⟩, ⟨n2, f2⟩, ...])
-3:     P ← new List
-4:     for all posting ⟨a, f⟩ ∈ postings [⟨n1, f1⟩, ⟨n2, f2⟩, ...] do
-5:       Append(P, ⟨a, f⟩)
-6:     Sort(P)
-7:     Emit(term t, postings P)
-Figure 4.2: Pseudo-code of the baseline inverted indexing algorithm in MapReduce. Any problem?
-Scalability Issue
-• Assumption of the baseline implementation:
-• The reducer has sufficient memory to hold all postings associated with the same term
-• Why?
-• The MapReduce framework makes no guarantees about the ordering of values associated with the same key.
-• The reducer first buffers all postings (line 5) and then performs an in-memory sort before writing the postings to disk
-Scalability Issue
-• How to solve it? The key idea is to let the MapReduce framework do the sorting for us
-• Instead of emitting (term t, posting ⟨docid, tf⟩)
-• Emit (tuple ⟨term t, docid⟩, tf)
-• Value-to-key conversion!!
-Revised Implementation
-• With value-to-key conversion, the MapReduce framework ensures the postings arrive in sorted order (based on the ⟨term, docid⟩ tuple)
-• Results can be written to disk directly
-• Caution: you need a customized partitioner to ensure that all tuples with the same term are shuffled to the same reducer
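A minimal sketch of such a partitioner, assuming the composite key is encoded as a tab-separated Text of the form "term<TAB>docid"; the class name is illustrative. It would be registered with job.setPartitionerClass(TermPartitioner.class).

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Only the term decides the partition, so all ⟨term, docid⟩ tuples for one term
// meet in the same reducer even though docid is part of the sort key.
class TermPartitioner extends Partitioner<Text, IntWritable> {
  @Override
  public int getPartition(Text key, IntWritable tf, int numPartitions) {
    String term = key.toString().split("\t", 2)[0];
    return (term.hashCode() & Integer.MAX_VALUE) % numPartitions;
  }
}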
-Revised Implementation
-1: class Mapper
-2:   method Map(docid n, doc d)
-3:     H ← new AssociativeArray
-4:     for all term t ∈ doc d do
-5:       H{t} ← H{t} + 1
-6:     for all term t ∈ H do
-7:       Emit(tuple ⟨t, n⟩, tf H{t})
-1: class Reducer
-2:   method Initialize
-3:     tprev ← ∅
-4:     P ← new PostingsList
-5:   method Reduce(tuple ⟨t, n⟩, tf [f])
-6:     if t ≠ tprev ∧ tprev ≠ ∅ then
-7:       Emit(term t, postings P)
-8:       P.Reset()
-9:     P.Add(⟨n, f⟩)
-10:    tprev ← t
-11:  method Close
-12:    Emit(term t, postings P)
-Figure 4.4: Pseudo-code of a scalable inverted indexing algorithm in MapReduce. By applying value-to-key conversion, results are directly written to disk.
-TF-IDF
-• Term Frequency – Inverse Document Frequency (TF-IDF)
-• Answers the question “How important is this term in a document?”
-• Known as a term weighting function
-• Assigns a score (weight) to each term (word) in a document
-• Very commonly used in text processing and search
-• Has many applications in data mining
-TF-IDF Motivation
-• Merely counting the number of occurrences of a word in a document is not a good enough measure of its relevance
-• If the word appears in many other documents, it is probably less relevant
-• Some words appear too frequently in all documents to be relevant
-• Known as ‘stopwords’
-• TF-IDF considers both the frequency of a word in a given document and the number of documents which contain the word
-TF-IDF: Definition
-• Term Frequency (TF)
-• Number of times a term appears in a document (i.e., the count)
-• Inverse Document Frequency (IDF)
-• idf = log(N / n)
-• N: total number of documents
-• n: number of documents that contain the term
-• TF-IDF = TF × IDF
-Computing TF-IDF With MapReduce
-• Overview of the algorithm: 3 MapReduce jobs
-• Job 1: compute term frequencies
-• Job 2: compute the number of documents each word occurs in
-• Job 3: compute TF-IDF
-Graph: Real-World Problems
-• Finding shortest paths
-• Routing Internet traffic and UPS trucks
-• Finding minimum spanning trees
-• Telco laying down fiber
-• Finding max flow
-• Airline scheduling
-• Identify “special” nodes and communities
-• Breaking up terrorist cells, spread of avian flu
-• Bipartite matching
-• Monster.com, Match.com
-• PageRank
-Graphs and MapReduce
-• Graph algorithms typically involve:
-• Performing computations at each node: based on node features, edge features, and local link structure
-• Propagating computations: “traversing” the graph
-• Challenge:
-• Algorithms running on a single machine and putting the entire graph in memory are not scalable
-• Key questions:
-• How do you represent graph data in MapReduce?
-• How do you traverse a graph in MapReduce?
-Graph Representations
-• Two common representations
-• adjacency matrix
-• adjacency list
-• Example graph as adjacency lists: n1: [n2, n4], n2: [n3, n5], n3: [n4], n4: [n5], n5: [n1, n2, n3]
-Figure 5.1: A simple directed graph (left) represented as an adjacency matrix (middle) and adjacency lists (right).
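For reference, here is the Figure 5.1 example graph written out as adjacency lists in Java (assuming Java 9+ for Map.of/List.of); this node-plus-out-links view is exactly the value layout the MapReduce graph algorithms below carry through each iteration.

import java.util.List;
import java.util.Map;

// Adjacency lists for the five-node example graph of Figure 5.1.
class ExampleGraph {
  static final Map<String, List<String>> ADJACENCY = Map.of(
      "n1", List.of("n2", "n4"),
      "n2", List.of("n3", "n5"),
      "n3", List.of("n4"),
      "n4", List.of("n5"),
      "n5", List.of("n1", "n2", "n3"));
}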
-• Adjacency matrix: easy to manipulate with linear algebra; easy algorithmic implementation; large memory space, especially for sparse graphs
-• Adjacency lists: much more compact representation; easy to compute over out-links; much more difficult to compute over in-links
-• However, the shuffle and sort mechanism in MapReduce provides an easy way to group edges by destination node.
-Single-Source Shortest Path
-• Problem: find shortest paths from a source node to all other nodes in the graph
-• Shortest means smallest hop count or lowest weight
-• Algorithms:
-• Breadth-first search: for finding minimum hop counts
-• Dijkstra’s algorithm: for finding minimum-cost paths in general graphs
-Dijkstra’s Algorithm
-Figure 5.3: Example of Dijkstra’s algorithm applied to a simple graph with five nodes, with n1 as the source and edge distances as indicated. Parts (a)–(e) show the running of the algorithm at each iteration, with the current distance inside the node. Nodes with thicker borders are those being expanded; nodes that have already been expanded are shown in black.
-Dijkstra’s Algorithm
-• Dijkstra’s algorithm is designed as a sequential algorithm
-• Key to Dijkstra’s algorithm:
-• A priority queue that maintains a globally sorted list of nodes by current distance
-• Not possible in MapReduce, which doesn’t provide a mechanism for exchanging global data
-• Solution:
-• Brute-force approach: parallel breadth-first search
-• Brute force: it revisits many nodes that have already been visited
-Parallel BFS
-• Consider the simple case of equal edge weights
-• Solution to the problem can be defined inductively
-• Here’s the intuition:
-• Define: b is reachable from a if b is on the adjacency list of a
-• DistanceTo(s) = 0
-• For all nodes p reachable from s, DistanceTo(p) = 1
-• For all nodes n reachable from some other set of nodes M, DistanceTo(n) = 1 + min(DistanceTo(m)), m ∈ M
-Visualizing Parallel BFS
-(Slide illustration: the BFS frontier expanding one hop per iteration over a small example graph.)
-From Intuition to Algorithm
-• Data representation:
-• Key: node n
-• Value: d (distance from start), adjacency list (nodes reachable from n)
-• Initialization: for all nodes except the start node, d = infinity
-• Mapper:
-• for all m in the adjacency list: emit (m, d + 1)
-• Sort/Shuffle:
-• Groups distances by reachable nodes
-• Reducer:
-• Selects the minimum distance path for each reachable node
-• Additional bookkeeping is needed to keep track of the actual path
-Multiple Iterations Needed
-• Each MapReduce iteration advances the “frontier” by one hop
-• Subsequent iterations include more and more reachable nodes as the frontier expands
-• Multiple iterations are needed to explore the entire graph
-• Preserving graph structure:
-• Problem: Where did the adjacency list go?
-• Solution: mapper emits (n, adjacency list) as well -78 -Mingshen Sun (CUHK) MapReduce & Hadoop -BFS Pseudo-Code -79 -Mingshen Sun (CUHK) MapReduce & Hadoop -Stopping Criterion -• How many iterations are needed in parallel BFS (equal edge -weight case)? -• Convince yourself: when a node is first “discovered”, we’ve -found the shortest path -• In practice, we iterate the algorithm until all node distances -are found (i.e., no more infinity) -• How? -• Maintain a counter inside the MapReduce program (i.e., count how -many node distances are found) -• Require a non-MapReduce driver program to submit a MapReduce -job to iterate the algorithm -• The driver program checks the counter value before submitting -another job -80 -Mingshen Sun (CUHK) MapReduce & Hadoop -Extend to General Weights -• Difference? -• How many iterations are needed in parallel BFS? -• How do we know that all shortest path distances are found? -81 -Mingshen Sun (CUHK) MapReduce & Hadoop -Other Graph Algorithms -• PageRank -• Subgraph pattern matching -• Computing simple graph statistics -• Degree vertex distributions -• Computing more complex graph statics -• Clustering coefficient -• Counting triangles -82 -Mingshen Sun (CUHK) MapReduce & Hadoop -Random Walks Over the Web -• Random surfer model: -• User starts at a random Web page -• User randomly clicks on links, surfing from page to page -• PageRank -• Characterizes the amount of time spent on any given page -• Mathematically, a probability distribution over pages -• PageRank captures notions of page importance -• Correspondence to human intuition? -• One of thousands of features used in web search (queryindependent) -83 -Mingshen Sun (CUHK) MapReduce & Hadoop -PageRank: Definition -• Given page x with inlinks t1…tn, where -• C(t) is the out-degree of t -• is probability of random jump -• N is the total number of nodes in the graph -84 -↵ -PR(x) = ↵ -✓ -1 -N -◆ -+ (1 \ No newline at end of file diff --git a/src/main/resources/sdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt.xml.xls b/src/main/resources/sdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt.xml.xls deleted file mode 100644 index a58c00c287c619eed922f553c0dccdd2a00a4251..0000000000000000000000000000000000000000 Binary files a/src/main/resources/sdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt.xml.xls and /dev/null differ diff --git a/src/main/resources/sdtocode/sd/sd-Hadoop HDFS1.txt b/src/main/resources/sdtocode/sd/sd-Hadoop HDFS1.txt index 729651169b560505a20ca0d6efe74e82a3c172f9..4b9016670fab689fffa36193490bb79c89c6e22a 100644 --- a/src/main/resources/sdtocode/sd/sd-Hadoop HDFS1.txt +++ b/src/main/resources/sdtocode/sd/sd-Hadoop HDFS1.txt @@ -1,18 +1,12 @@ -DFSOutputStream -@DFSOutputStream -%NameNode -%addBlock() -¥DFSOutputStream -%DataNode -%opWriteBlock()¥ -@#NameNode -@NameNode -%DFSOutputStream -%return nodes -¥@#DataNode -@DataNode -%NameNode -%blockReceived -¥DataNode -%DFSOutputStream -%¥@# \ No newline at end of file +addBlock()%DFSOutputStream%NameNode%false¥ +return nodes%NameNode%DFSOutputStream%true¥ +opWriteBlock()%DFSOutputStream%DataNode%false¥ +opWriteBlock()%DataNode%DataNode%false¥ +opWriteBlock()%DataNode%DataNode%false¥ +blockReceived()%DataNode%NameNode%true¥ +%DataNode%DFSOutputStream%true¥ +blockReceived()%DataNode%NameNode%true¥ +%DataNode%DFSOutputStream%true¥ +blockReceived()%DataNode%NameNode%true¥ +%DataNode%DFSOutputStream%true¥ +%NameNode%DFSOutputStream%true¥ \ No newline at end of file diff --git a/src/main/resources/sdtocode/sd/sd-Hadoop HDFS2.txt 
b/src/main/resources/sdtocode/sd/sd-Hadoop HDFS2.txt index c187e10d618e79b14b9b283370a7abba83039b52..594d0a40d98b6d5750fc854f3823c7a7504ba424 100644 --- a/src/main/resources/sdtocode/sd/sd-Hadoop HDFS2.txt +++ b/src/main/resources/sdtocode/sd/sd-Hadoop HDFS2.txt @@ -1,38 +1,12 @@ -DFSOutputStream -@DFSOutputStream -%DataStreamer -%run() -¥@#DataStreamer -@DataStreamer -%DataNodeInfo -%nextBlockOutputStream() -¥DataStreamer -%DataNode -%DataTransferProtocol.Sender.opWriteBlock() -¥@#DataNodeInfo -@DataNodeInfo -%LocatedBlock -%locateFollowingBlock() -¥DataNodeInfo -%DataStreamer -%¥@#LocatedBlock -@LocatedBlock -%NameNode -%dfsClient.namenode.addBlock() -¥LocatedBlock -%DataNodeInfo -%return LocatedBlock -¥@#NameNode -@NameNode -%LocatedBlock -%return LocatedBlock -¥@#DataNode -@DataNode -%DataTransferProtocol.Sender -%write -¥@#DataTransferProtocol.Sender -@DataTransferProtocol.Sender -%DataStreamer -%¥DataTransferProtocol.Sender -%NameNode -%blockReceived¥@# \ No newline at end of file +run()%DFSOutputStream%DataStreamer%false¥ +nextBlockOutputStream()%DataStreamer%DataNodeInfo%false¥ +locateFollowingBlock%DataNodeInfo%LocatedBlock%false¥ +dfsClient.namenode.addBlock()%LocatedBlock%NameNode%false¥ +return LocatedBlock%NameNode%LocatedBlock%true¥ +return LocatedBlock%LocatedBlock%DataNodeInfo%true¥ +return LocatedBlock%DataNodeInfo%DataStreamer%true¥ +DataTransferProtocol.Sender.opWriteBlock()%DataStreamer%DataNode%false¥ +write%DataNode%DataTransferProtocol:Sender%false¥ +ack from DataNode to DataStreamer%DataTransferProtocol:Sender%DataStreamer%true¥ +blockReceived%DataTransferProtocol:Sender%DataNode%true¥ +return nodes%DataNodeInfo%DataStreamer%true¥ \ No newline at end of file diff --git a/src/main/resources/sdtocode/sd/sd-Hadoop MapReduce.txt b/src/main/resources/sdtocode/sd/sd-Hadoop MapReduce.txt index 3735143fa8d19bf79a1d8924b63bed455c0a139c..0fe18ea6cebf884797198a359fca32b6097a8aba 100644 --- a/src/main/resources/sdtocode/sd/sd-Hadoop MapReduce.txt +++ b/src/main/resources/sdtocode/sd/sd-Hadoop MapReduce.txt @@ -1,25 +1,7 @@ -JobClient -@JobClient -%JobTracker -%getNewJobId() -¥JobClient -%HDFS -%mkdirs() -¥JobClient -%HDFS -%copyRemoteFiles() -¥JobClient -%HDFS -%writeSplits() -¥JobClient -%HDFS -%writeXml() -¥JobClient -%HDFS -%submitJob(job) -¥@#JobTracker -@JobTracker -%JobClient -%jobId -¥@#HDFS -@@# \ No newline at end of file +getNewJobId()%JobClient%JobTracker%false¥ +jobId%JobTracker%JobClient%true¥ +mkdirs()%JobClient%HDFS%false¥ +copyRemoteFiles()%JobClient%HDFS%false¥ +writeSplits()%JobClient%HDFS%false¥ +writeXml()%JobClient%HDFS%false¥ +submitJob(job)%JobClient%JobTracker%false¥ \ No newline at end of file