diff --git a/src/main/java/com/hy/java/uct/cdtocode/CDToCodeTracer.java b/src/main/java/com/hy/java/uct/cdtocode/CDToCodeTracer.java index d6ae3f901451c0a906cf1dd8a022458c83fd46cd..cd6a0dcefad2dffe1b89a9fcd7097e9b383cf071 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/CDToCodeTracer.java +++ b/src/main/java/com/hy/java/uct/cdtocode/CDToCodeTracer.java @@ -3,6 +3,7 @@ package com.hy.java.uct.cdtocode; import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Set; import com.hy.java.uct.cdtocode.mapper.CodeRelationMapper; import com.hy.java.uct.cdtocode.mapper.DocAnalyzer; @@ -43,13 +44,15 @@ public class CDToCodeTracer { /* * Apache OODT File Manager */ - Map classes_in_CD = CDReader.read(cd_dir + "cd-Apache OODT File Manager.txt"); + // Map classes_in_CD = CDReader.read(cd_dir + "cd-Apache OODT File Manager.txt"); /* * Hadoop HDFS */ + // Map classes_in_CD = CDReader.read(cd_dir + "cd-Hadoop HDFS.txt"); /* * Hadoop MapReduce */ + Map classes_in_CD = CDReader.read(cd_dir + "cd-Hadoop MapReduce.txt"); // 检查结果,可注释掉 // CDReader.check(classes_in_CD); /* @@ -62,22 +65,47 @@ public class CDToCodeTracer { /* * Apache OODT File Manager */ - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt"); - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions.txt"); - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\cas-filemgr – CAS File Manager Developer Guide.txt"); - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Catalog and Archive File Management Component.txt"); - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\File Manager Scale Out Planning - OODT - Apache Software Foundation.txt"); - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Interface Ingester.txt"); - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Mahasen Distributed Storage Resource Broker.txt"); - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\OODT Filemgr User Guide.txt"); - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Package org.apache.oodt.cas.filemgr.cli.action.txt"); - doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\React file manager.txt"); + /* + * doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt"); + * doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions.txt"); doc_dir_ls.add(doc_dir + + * "Apache OODT File Manager\\cas-filemgr – CAS File Manager Developer Guide.txt"); doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Catalog and Archive File Management Component.txt"); + * doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\File Manager Scale Out Planning - OODT - Apache Software Foundation.txt"); doc_dir_ls.add(doc_dir + + * "Apache OODT File Manager\\Interface Ingester.txt"); doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Mahasen Distributed Storage Resource Broker.txt"); doc_dir_ls.add(doc_dir + + * "Apache OODT File Manager\\OODT Filemgr User Guide.txt"); doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Package org.apache.oodt.cas.filemgr.cli.action.txt"); doc_dir_ls.add(doc_dir + + * "Apache OODT File Manager\\React file manager.txt"); + */ /* * Hadoop HDFS */ + /* + * 
doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Hadoop architectural overview.txt"); doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Hadoop clusters with Kove® XPD™ persistent memory.txt"); doc_dir_ls.add(doc_dir + + * "Hadoop HDFS\\HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt"); doc_dir_ls.add(doc_dir + + * "Hadoop HDFS\\Hadoop Distributed File System (HDFS) Architecture – A Guide to HDFS for Every Data Engineer.txt"); doc_dir_ls.add(doc_dir + "Hadoop HDFS\\HADOOP ECOSYSTEM.txt"); + * doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Hadoop HDFS Architecture Explanation and Assumptions.txt"); doc_dir_ls.add(doc_dir + "Hadoop HDFS\\HDFS Architecture Guide.txt"); doc_dir_ls.add(doc_dir + + * "Hadoop HDFS\\HDFS Architecture.txt"); doc_dir_ls.add(doc_dir + "Hadoop HDFS\\HDFS.txt"); doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Key Design of HDFS Architecture.txt"); doc_dir_ls.add(doc_dir + + * "Hadoop HDFS\\The Hadoop Distributed File System Architecture and Design.txt"); doc_dir_ls.add(doc_dir + "Hadoop HDFS\\Towards A Scalable HDFS Architecture.txt"); + */ /* * Hadoop MapReduce */ + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Apache Hadoop Architecture – HDFS, YARN & MapReduce.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Big Data Analysis Challenges and Solutions.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Big Data Management on Wireless Sensor Networks.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Hadoop - MapReduce.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Hadoop Architecture in Detail – HDFS, Yarn & MapReduce.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Hadoop MapReduce- Java-based Processing Framework for Big Data.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce – Components.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce Architecture1.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce Architecture2.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce Architecture3.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce Tutorial.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce Working and Components.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\MapReduce.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\mapreduce_hadoop2.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\Understanding MapReduce in Hadoop.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\What are the components of MapReduce.txt"); + doc_dir_ls.add(doc_dir + "Hadoop MapReduce\\What Is MapReduce Architecture An Important Overview For 2021.txt"); // 实际使用的Map,保存每份文档地址及其内容 Map> dir_sentences_map = DocReader.readDocs(doc_dir_ls); /* @@ -85,10 +113,16 @@ public class CDToCodeTracer { * * <类全称(包+类名), java_file_path> */ - // 记得改这里面写的路径 - Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path"); + // 记得改这里的路径 + // Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path-fm"); + // Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path-hdfs"); + Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path-mr"); // 检查结果,可注释掉 // CodeReader.check(classFullName_javaFileDir_map); + // 针对每个图中的类名,如果代码中有相似的包名,则将包下所有相似的类(字符串包含至少一个关键词即可)都重点标记。此脚本为一次性,使用后删除即可 + // Set packs = CodeReader.findPcks(code_dir + "code path-fm"); + // Set packs = CodeReader.findPcks(code_dir + "code path-hdfs"); + // Set packs = 
CodeReader.findPcks(code_dir + "code path-mr"); /* * 4、分析文档信息。实际相当于增加类图中的UMLclass、类本身的内容、类之间关系 * @@ -112,7 +146,7 @@ public class CDToCodeTracer { /* * Apache OODT File Manager */ - CodeRelationMapper.save(mapped_classes, res_dir + "Apache OODT File Manager.xls"); + // CodeRelationMapper.save(mapped_classes, res_dir + "Apache OODT File Manager.xls"); /* * Hadoop HDFS */ @@ -120,7 +154,7 @@ public class CDToCodeTracer { /* * Hadoop MapReduce */ - // CodeRelationMapper.save(mapped_classes, res_dir + "Hadoop MapReduce.xls"); + CodeRelationMapper.save(mapped_classes, res_dir + "Hadoop MapReduce.xls"); // 检查结果,可注释掉 // CodeRelationMapper.check(res_dir + "Apache OODT File Manager.xls"); } diff --git a/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIDataGenerator.java b/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIDataGenerator.java deleted file mode 100644 index d35e4514e49f52499d743ceabca69c7625f2ef08..0000000000000000000000000000000000000000 --- a/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIDataGenerator.java +++ /dev/null @@ -1,9 +0,0 @@ -package com.hy.java.uct.cdtocode; - -public class VSMAndLSIDataGenerator { - - public static void main(String[] args) { - // TODO Auto-generated method stub - - } -} diff --git a/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java b/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java index 58514846ce2edd48b1769bbf4e1ef59771aef6db..bd1a803761c961f7a011c3c86e687cef4ad212c8 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java +++ b/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java @@ -3,6 +3,7 @@ package com.hy.java.uct.cdtocode.mapper; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; +import java.math.BigDecimal; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -41,6 +42,17 @@ import jxl.write.biff.RowsExceededException; public class CodeRelationMapper { + // 文档实体与代码的相似度阈值。小项目0.6左右,大项目0.75左右 + private static final double sim_threshold = 0.75; + // 同一包下文档实体与代码的相似度阈值。小项目0.25左右,大项目0.65左右 + private static final double sim_p_threshold = 0.65; + // 包名相似度阈值。小项目0.85左右,大项目1.0左右 + private static final double sim_pck_threshold = 1.0; + // 文档实体平均追踪概率阈值。小项目0.35左右,大项目0.7~0.75左右 + private static final double ave_threshold = 0.75; + // 类图最终追踪概率阈值。小项目0.55左右,大项目0.75~0.8左右 + private static final double pre_threshold = 0.8; + /** * 对扩展后的模型与代码进行匹配 */ @@ -57,12 +69,16 @@ public class CodeRelationMapper { UMLClass UML_class = classShortName_classObj_mappedByDoc.get(ClsImg_shortName); // 对于每个UML_class,将其追踪到的每个Ent_doc,都追踪到代码(即Cls_code) for (Entity mappedEnt : UML_class.mappedEnt_ls) { - // 先将Ent_doc追踪到Cls_code。记录每个追踪到的java文件的类全称,以及追踪概率Sim(Ent_doc,Cls_code) - mapEntToCode(mappedEnt, classFullName_javaFileDir_map); - // 然后对Ent_doc的每个related_Ent,同样将其追踪到Cls_code - for (EntityRelation relation : mappedEnt.relations_between_Ent) { - Entity related_Ent = relation.related_ent; - mapEntToCode(related_Ent, classFullName_javaFileDir_map); + if (mappedEnt.sim_ClsImg_EntDoc == -1.0) { + mapEntToCode(mappedEnt, classFullName_javaFileDir_map, true); + } else { + // 先将Ent_doc追踪到Cls_code。记录每个追踪到的java文件的类全称,以及追踪概率Sim(Ent_doc,Cls_code) + mapEntToCode(mappedEnt, classFullName_javaFileDir_map, false); + // 然后对Ent_doc的每个related_Ent,同样将其追踪到Cls_code + for (EntityRelation relation : mappedEnt.relations_between_Ent) { + Entity related_Ent = relation.related_ent; + mapEntToCode(related_Ent, classFullName_javaFileDir_map, false); + } } } 
System.out.println("对" + UML_class.getTitle() + "的模糊匹配完成。匹配完所有类后,还需基于关系推理取舍追踪"); @@ -90,12 +106,14 @@ public class CodeRelationMapper { /* * 最终两部分概率合并即得图中每个类的追踪推荐列表(如果太多,则取前5,或者取概率>50%的) * - * P(Cls_img,Cls_code)=PE_doc(Cls_img,Ent_doc)×PE_code(Ent_doc,Cls_code,R) + * P(Cls_img,Cls_code,R)=PE_doc(Cls_img,Ent_doc)×PE_code(Ent_doc,Cls_code,R) * * 注意:目前,Cls_img→数个Ent_doc→数个Cls_code(每个都带着数个PEcode_R) */ getInitTraceResults(classShortName_classObj_mappedByDoc); // 此时UML_class.duplicated_mapped_javaFile_ls可能包含重复结果,所以还需过滤一遍 + mergeDuplicatedResults(classShortName_classObj_mappedByDoc); + // 过滤完之后,得到的是P(Cls_img,Cls_code,R),所以将其合并得到的P(Cls_img,Cls_code)。Cls_img→数个Cls_code filterTraceResults(classShortName_classObj_mappedByDoc); System.out.println("完成对扩展后的模型与代码进行匹配"); return classShortName_classObj_mappedByDoc; @@ -103,30 +121,55 @@ public class CodeRelationMapper { /** * 将Ent_doc追踪到Cls_code。记录每个追踪到的java文件的类全称,以及追踪概率Sim(Ent_doc,Cls_code) + * + * @param is_ClsName */ - private static void mapEntToCode(Entity Ent_doc, Map classFullName_javaFileDir_map) { + private static void mapEntToCode(Entity Ent_doc, Map classFullName_javaFileDir_map, boolean is_ClsName) { Set ClsCode_fullName_set = classFullName_javaFileDir_map.keySet(); - // 遍历一遍全体java文件,找所有与Ent_doc相似的Cls_code - for (String ClsCode_fullName : ClsCode_fullName_set) { - // 对每个java文件对应的full name,获取其short name - String ClsCode_shortName = getClsShortNameFromFullName(ClsCode_fullName); - // 如果Ent_doc.name与Cls_code的short name相似,再进行属性、方法的比较,否则直接pass - if (EntName_SimilarWith_ClsShortName(Ent_doc.name, ClsCode_shortName)) { - // 此时Ent_doc.name与Cls_code的short name相似。暂时记录名称相似度 - double name_similarity = sim_EntDocName_ClsCodeName(Ent_doc.name, ClsCode_shortName); - // 然后进行属性、方法的比较 - try { - CompilationUnit Cls_code = StaticJavaParser.parse(new File(classFullName_javaFileDir_map.get(ClsCode_fullName))); - // 判断Ent_doc与Cls_code的属性、方法是否相似(即:Ent_doc的属性、方法完全被包含于Cls_code中) - Pair attriMethod_similarity_pair = EntAttriMethod_SimilarWith_ClsAttriMethod(Ent_doc, Cls_code, ClsCode_shortName); - // 如果Ent_doc与Cls_code的属性、方法相似,则将其追踪过去 - if (attriMethod_similarity_pair.getLeft() == true) { - // 追踪时记录Cls_code的full name,便于在map里查找 - // 追踪概率Sim(Ent_doc,Cls_code)=名称相似度*属性方法相似度 - Ent_doc.possibleMapped_javaFiles.add(Pair.createPair(ClsCode_fullName, name_similarity * attriMethod_similarity_pair.getRight())); + if (is_ClsName) { + for (String ClsCode_fullName : ClsCode_fullName_set) { + // 对每个java文件对应的full name,获取其short name + String ClsCode_shortName = getClsShortNameFromFullName(ClsCode_fullName); + // 如果Ent_doc.name与Cls_code的short name相似,再进行属性、方法的比较,否则直接pass + if (EntName_SimilarWith_ClsShortName(Ent_doc.name.substring(6), ClsCode_shortName, 1.0)) { + Ent_doc.possibleMapped_javaFiles.add(Pair.createPair(ClsCode_fullName, 1.0)); + } + } + } else { + // 遍历一遍全体java文件,找所有与Ent_doc相似的Cls_code + for (String ClsCode_fullName : ClsCode_fullName_set) { + // 对每个java文件对应的full name,获取其short name + String ClsCode_shortName = getClsShortNameFromFullName(ClsCode_fullName); + String ClsCode_parentPackage = getParentPackageFromFullName(ClsCode_fullName); + // 如果Ent_doc.name与Cls_code的short name相似,再进行属性、方法的比较,否则直接pass + // 此处的阈值会影响准确率。建议在0.5左右 + if (EntName_SimilarWith_ClsShortName(Ent_doc.name, ClsCode_shortName, sim_threshold)) { + // 此时Ent_doc.name与Cls_code的short name相似。暂时记录名称相似度 + double name_similarity = sim_EntDocName_ClsCodeName(Ent_doc.name, ClsCode_shortName); + if (name_similarity == 1.0) { + Ent_doc.possibleMapped_javaFiles.add(Pair.createPair(ClsCode_fullName, name_similarity)); + } 
else { + // 然后进行属性、方法的比较 + try { + CompilationUnit Cls_code = StaticJavaParser.parse(new File(classFullName_javaFileDir_map.get(ClsCode_fullName))); + // 判断Ent_doc与Cls_code的属性、方法是否相似(即:Ent_doc的属性、方法完全被包含于Cls_code中) + Pair attriMethod_similarity_pair = EntAttriMethod_SimilarWith_ClsAttriMethod(Ent_doc, Cls_code, ClsCode_shortName); + // 如果Ent_doc与Cls_code的属性、方法相似,则将其追踪过去 + if (attriMethod_similarity_pair.getLeft() == true) { + // 追踪时记录Cls_code的full name,便于在map里查找 + // 追踪概率Sim(Ent_doc,Cls_code)=名称相似度*属性方法相似度 + Ent_doc.possibleMapped_javaFiles.add(Pair.createPair(ClsCode_fullName, name_similarity * attriMethod_similarity_pair.getRight())); + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + } + } else if (EntName_SimilarWith_ClsShortName(Ent_doc.name.toLowerCase(), ClsCode_parentPackage.toLowerCase(), sim_pck_threshold)) { + // 此时Ent_doc.name与Cls_code的父包相似。暂时记录名称相似度 + double name_similarity = sim_EntDocName_ClsCodeName(Ent_doc.name, ClsCode_fullName); + if (name_similarity >= sim_p_threshold) { + Ent_doc.possibleMapped_javaFiles.add(Pair.createPair(ClsCode_fullName, name_similarity)); } - } catch (FileNotFoundException e) { - e.printStackTrace(); } } } @@ -139,10 +182,18 @@ public class CodeRelationMapper { return clsCode_fullName.substring(clsCode_fullName.lastIndexOf(".") + 1); } + /** + * 通过类全称,获得类的父包 + */ + private static String getParentPackageFromFullName(String clsCode_fullName) { + String temp = clsCode_fullName.substring(0, clsCode_fullName.lastIndexOf(".")); + return temp.substring(temp.lastIndexOf(".") + 1); + } + /** * 对每个UML_class,根据其title,找java_files中类全称的类名与其相似的java文件 */ - private static boolean EntName_SimilarWith_ClsShortName(String name1, String name2) { + private static boolean EntName_SimilarWith_ClsShortName(String name1, String name2, double threshold) { // 如果两个String相等,则返回true if (name1.equals(name2)) { return true; @@ -153,7 +204,7 @@ public class CodeRelationMapper { String name1_lower = name1.toLowerCase(); String name2_lower = name2.toLowerCase(); // 判断处理后的两个String是否相似 - if (sim_EntDocName_ClsCodeName(name1_lower, name2_lower) > 0.5) { + if (sim_EntDocName_ClsCodeName(name1_lower, name2_lower) > threshold) { return true; } } @@ -333,42 +384,54 @@ public class CodeRelationMapper { // 对每个Ent_doc,查其所有追踪到的Cls_code是否与Ent_doc.related_Ent有关系 for (Entity Ent_doc : UML_class.mappedEnt_ls) { for (Pair possibleMapped_javaFile_pair : Ent_doc.possibleMapped_javaFiles) { - try { - CompilationUnit possibleMapped_ClsCode = StaticJavaParser.parse(new File(classFullName_javaFileDir_map.get(possibleMapped_javaFile_pair.getLeft()))); - String cls_shortName = getClsShortNameFromFullName(possibleMapped_javaFile_pair.getLeft()); - /* - * 如果一个Cls_code与某个由related_Ent追踪到的Cls_code'有同类关系,则该Cls_code就是“根据关系推理追踪到的代码文件”。 - * - * 注意: 如果Ent_code没有related_Ent,则直接认为Ent_code能纯粹根据文本相似度追踪到Cls_code - */ - Pair> ifTrulyMapped_possibleRs = check_if_trulyMapped(possibleMapped_ClsCode, cls_shortName, possibleMapped_javaFile_pair.getLeft(), Ent_doc.relations_between_Ent); - // 如果确实真的可以这么追踪,则记录这个文件,并计算追踪概率 - if (ifTrulyMapped_possibleRs.getLeft() == true) { - // 记录这个“根据关系推理追踪到的代码文件”。 - MappedFile truly_mapped_file_forEnt = new MappedFile(); - truly_mapped_file_forEnt.java_file_dir = classFullName_javaFileDir_map.get(possibleMapped_javaFile_pair.getLeft()); - // 记录在每条关系R下从Ent_doc追踪到Cls_code的概率PEcode_R - if (ifTrulyMapped_possibleRs.getRight().size() <= 0) { - // PE_code(Ent_doc,Cls_code,R) - // 如果Ent_code能纯粹根据文本相似度追踪到Cls_code,则将PEcode_R中的PR_doc部分置为默认值1 - double PEcode_R = 0.5 * 
possibleMapped_javaFile_pair.getRight() + 0.5 * 1.0; - truly_mapped_file_forEnt.PEcode_R_ls.add(Pair.createPair(null, PEcode_R)); - } else { + if (possibleMapped_javaFile_pair.getRight() == 1.0) { + // 记录这个“根据关系推理追踪到的代码文件”。 + MappedFile truly_mapped_file_forEnt = new MappedFile(); + truly_mapped_file_forEnt.java_file_dir = classFullName_javaFileDir_map.get(possibleMapped_javaFile_pair.getLeft()); + // PE_code(Ent_doc,Cls_code,R) + // 如果Ent_code能纯粹根据文本相似度追踪到Cls_code,则将PEcode_R中的PR_doc部分置为默认值1 + double PEcode_R = 0.5 * possibleMapped_javaFile_pair.getRight() + 0.5 * 1.0; + truly_mapped_file_forEnt.PEcode_R_ls.add(Pair.createPair(null, PEcode_R)); + Ent_doc.trulyMapped_javaFiles.add(truly_mapped_file_forEnt); + } else { + try { + CompilationUnit possibleMapped_ClsCode = StaticJavaParser.parse(new File(classFullName_javaFileDir_map.get(possibleMapped_javaFile_pair.getLeft()))); + String cls_shortName = getClsShortNameFromFullName(possibleMapped_javaFile_pair.getLeft()); + /* + * 如果一个Cls_code与某个由related_Ent追踪到的Cls_code'有同类关系,则该Cls_code就是“根据关系推理追踪到的代码文件”。 + * + * 注意: 如果Ent_code没有related_Ent,则直接认为Ent_code能纯粹根据文本相似度追踪到Cls_code + */ + Pair> ifTrulyMapped_possibleRs = check_if_trulyMapped(possibleMapped_ClsCode, cls_shortName, possibleMapped_javaFile_pair.getLeft(), Ent_doc.relations_between_Ent); + // 如果确实真的可以这么追踪,则记录这个文件,并计算追踪概率 + if (ifTrulyMapped_possibleRs.getLeft() == true) { + // 记录这个“根据关系推理追踪到的代码文件”。 + MappedFile truly_mapped_file_forEnt = new MappedFile(); + truly_mapped_file_forEnt.java_file_dir = classFullName_javaFileDir_map.get(possibleMapped_javaFile_pair.getLeft()); // 记录在每条关系R下从Ent_doc追踪到Cls_code的概率PEcode_R - for (EntityRelation R : ifTrulyMapped_possibleRs.getRight()) { + if (ifTrulyMapped_possibleRs.getRight().size() <= 0) { // PE_code(Ent_doc,Cls_code,R) - double PEcode_R = 0.5 * possibleMapped_javaFile_pair.getRight() + 0.5 * R.PR_doc; - truly_mapped_file_forEnt.PEcode_R_ls.add(Pair.createPair(R, PEcode_R)); + // 如果Ent_code能纯粹根据文本相似度追踪到Cls_code,则将PEcode_R中的PR_doc部分置为默认值1 + double PEcode_R = 0.5 * possibleMapped_javaFile_pair.getRight() + 0.5 * 1.0; + truly_mapped_file_forEnt.PEcode_R_ls.add(Pair.createPair(null, PEcode_R)); + } else { + // 记录在每条关系R下从Ent_doc追踪到Cls_code的概率PEcode_R + for (EntityRelation R : ifTrulyMapped_possibleRs.getRight()) { + // PE_code(Ent_doc,Cls_code,R) + double PEcode_R = 0.5 * possibleMapped_javaFile_pair.getRight() + 0.5 * R.PR_doc; + truly_mapped_file_forEnt.PEcode_R_ls.add(Pair.createPair(R, PEcode_R)); + } } + Ent_doc.trulyMapped_javaFiles.add(truly_mapped_file_forEnt); + // 进行到这儿后,对每个Ent_doc而言,会将其追踪到数个truly_mapped_file,每个truly_mapped_file均对应数个PE_code(Ent_doc,Cls_code,R) } - Ent_doc.trulyMapped_javaFiles.add(truly_mapped_file_forEnt); - // 进行到这儿后,对每个Ent_doc而言,会将其追踪到数个truly_mapped_file,每个truly_mapped_file均对应数个PE_code(Ent_doc,Cls_code,R) + } catch (FileNotFoundException e) { + e.printStackTrace(); } - } catch (FileNotFoundException e) { - e.printStackTrace(); } } } + System.out.println("已推理完" + UML_class.getTitle()); } } @@ -605,9 +668,16 @@ public class CodeRelationMapper { for (Pair PEcode_R : trulyMapped_javaFile_forEnt.PEcode_R_ls) { // P(Cls_img,Cls_code)=PE_doc(Cls_img,Ent_doc)×PE_code(Ent_doc,Cls_code,R) double P_ClsImg_ClsCode = Ent_doc.PE_doc * PEcode_R.getRight(); - trulyMapped_javaFile_forCls.P_ls.add(Pair.createPair(PEcode_R.getLeft(), P_ClsImg_ClsCode)); + // 从文件名分析出类名 + String fileName = trulyMapped_javaFile_forCls.java_file_dir.substring(trulyMapped_javaFile_forCls.java_file_dir.lastIndexOf("\\") + 1, 
trulyMapped_javaFile_forCls.java_file_dir.lastIndexOf(".")); + // 如果完全一样,则必定100%追踪 + if (UML_class.getTitle().equals(fileName)) { + trulyMapped_javaFile_forCls.PCCr_ls.add(Pair.createPair(PEcode_R.getLeft(), 1.0)); + } else { + trulyMapped_javaFile_forCls.PCCr_ls.add(Pair.createPair(PEcode_R.getLeft(), P_ClsImg_ClsCode)); + } } - UML_class.duplicated_mapped_javaFile_ls.add(trulyMapped_javaFile_forCls); + UML_class.duplicated_mappedJavaFiles_underDiffRelations.add(trulyMapped_javaFile_forCls); } } } @@ -616,13 +686,13 @@ public class CodeRelationMapper { /** * 过滤追踪结果,使其无重复 */ - private static void filterTraceResults(Map classShortName_classObj_mappedByDoc) { + private static void mergeDuplicatedResults(Map classShortName_classObj_mappedByDoc) { Set ClsImg_shortName_set = classShortName_classObj_mappedByDoc.keySet(); // 开始过滤 for (String ClsImg_shortName : ClsImg_shortName_set) { UMLClass UML_class = classShortName_classObj_mappedByDoc.get(ClsImg_shortName); Map temp = new HashMap<>(); - for (MappedFile mapped_javaFile_inCls : UML_class.duplicated_mapped_javaFile_ls) { + for (MappedFile mapped_javaFile_inCls : UML_class.duplicated_mappedJavaFiles_underDiffRelations) { // 首先,先对mapped_javaFile_inCls自己追踪到的javaFile去重 filterMapped_javaFile_inCls(mapped_javaFile_inCls); // 其次,再进行两两比对 @@ -637,7 +707,7 @@ public class CodeRelationMapper { // 过滤完毕,将结果保存在UML_class.mapped_javaFile_ls中 Set mappedFileDir_set = temp.keySet(); for (String mappedFileDir : mappedFileDir_set) { - UML_class.mapped_javaFile_ls.add(temp.get(mappedFileDir)); + UML_class.mappedJavaFiles_underDiffRelations.add(temp.get(mappedFileDir)); } } } @@ -649,7 +719,7 @@ public class CodeRelationMapper { Map>> cluster_map = new HashMap<>(); Set> Ps = new HashSet<>(); // 先分堆 - for (Pair P : mapped_javaFile_inCls.P_ls) { + for (Pair P : mapped_javaFile_inCls.PCCr_ls) { if (P.getLeft() == null) { if (cluster_map.containsKey("这是null")) { cluster_map.get("这是null").add(P); @@ -680,9 +750,9 @@ public class CodeRelationMapper { Ps.add(P); } // 然后将res_map转存到mapped_javaFile_inCls.P_ls中 - mapped_javaFile_inCls.P_ls.clear(); + mapped_javaFile_inCls.PCCr_ls.clear(); for (Pair P : Ps) { - mapped_javaFile_inCls.P_ls.add(P); + mapped_javaFile_inCls.PCCr_ls.add(P); } } @@ -693,12 +763,20 @@ public class CodeRelationMapper { MappedFile res = new MappedFile(); res.java_file_dir = mapped_javaFile_inTemp.java_file_dir; // 开始合并两个MappedFile的关系 - for (Pair P_inTemp : mapped_javaFile_inTemp.P_ls) { + for (Pair P_inTemp : mapped_javaFile_inTemp.PCCr_ls) { Pair P_inRes = Pair.createPair(P_inTemp.getLeft(), P_inTemp.getRight()); // 接下来的比较类似排序:比较P与mapped_javaFile_inCls的P_inCls,如果两者具有相同的关系,则取概率大的 - for (Pair P_inCls : mapped_javaFile_inCls.P_ls) { + for (Pair P_inCls : mapped_javaFile_inCls.PCCr_ls) { if (P_inRes.getLeft() != null && P_inCls.getLeft() != null) { - if (P_inRes.getLeft().related_ent.name.equals(P_inCls.getLeft().related_ent.name) && P_inRes.getLeft().relation_type.equals(P_inCls.getLeft().relation_type)) { + if (P_inCls.getLeft().related_ent != null) { + if (P_inRes.getLeft().related_ent.name.equals(P_inCls.getLeft().related_ent.name) && P_inRes.getLeft().relation_type.equals(P_inCls.getLeft().relation_type)) { + P_inCls.getLeft().should_be_del = true; + // 取概率大的 + if (P_inRes.getRight() < P_inCls.getRight()) { + P_inRes.setRight(P_inCls.getRight()); + } + } + } else { P_inCls.getLeft().should_be_del = true; // 取概率大的 if (P_inRes.getRight() < P_inCls.getRight()) { @@ -716,19 +794,19 @@ public class CodeRelationMapper { } } } - res.P_ls.add(P_inRes); + 
res.PCCr_ls.add(P_inRes); } // 然后再把mapped_javaFile_inCls里没检测到的P_inCls都添加上 - for (Pair P_inCls : mapped_javaFile_inCls.P_ls) { + for (Pair P_inCls : mapped_javaFile_inCls.PCCr_ls) { if (P_inCls.getLeft() == null) { // 这时如果还是null,说明P_inTemp的P全都不是null。所以P_inCls的null的P都未被检测过,因此直接添加P_inCls的这个P - res.P_ls.add(P_inCls); + res.PCCr_ls.add(P_inCls); } else if (!P_inCls.getLeft().should_be_del) { - res.P_ls.add(P_inCls); + res.PCCr_ls.add(P_inCls); } } // 最后过滤一下res,把假的EntityRelation都删掉 - for (Pair P : res.P_ls) { + for (Pair P : res.PCCr_ls) { if (P.getLeft() != null) { if (P.getLeft().related_ent == null) { P.setLeft(null); @@ -738,6 +816,36 @@ public class CodeRelationMapper { return res; } + /** + * 将P(Cls_img,Cls_code,R)合并得到的P(Cls_img,Cls_code)。Cls_img→数个Cls_code + * + * 连乘式合并:P(Cls_img,Cls_code)=1-π(1-P(Cls_img,Cls_code,R)) + */ + private static void filterTraceResults(Map classShortName_classObj_mappedByDoc) { + Set ClsImg_shortName_set = classShortName_classObj_mappedByDoc.keySet(); + for (String ClsImg_shortName : ClsImg_shortName_set) { + UMLClass UML_class = classShortName_classObj_mappedByDoc.get(ClsImg_shortName); + // 对每个Cls_code,合并其所有P(Cls_img,Cls_code,R),成为一个P(Cls_img,Cls_code)存在UML_class.mappedJavaFiles中 + for (MappedFile mapped_javaFile : UML_class.mappedJavaFiles_underDiffRelations) { + BigDecimal pai = new BigDecimal(1.0); + double sum = 0.0; + // 计算P(Cls_img,Cls_code)=1-π(1-P(Cls_img,Cls_code,R)) + for (Pair PCCr : mapped_javaFile.PCCr_ls) { + pai = pai.multiply(new BigDecimal(1.0 - PCCr.getRight())); + sum += PCCr.getRight(); + } + mapped_javaFile.P = (new BigDecimal(1.0)).subtract(pai).doubleValue(); + double ave = sum / mapped_javaFile.PCCr_ls.size(); + // 此处的阈值会影响准确率。建议在0.3左右 + if (mapped_javaFile.P == 1.0) { + UML_class.mappedJavaFiles.add(mapped_javaFile); + } else if (mapped_javaFile.P > pre_threshold && ave > ave_threshold) { + UML_class.mappedJavaFiles.add(mapped_javaFile); + } + } + } + } + /** * 保存追踪结果 * @@ -749,83 +857,30 @@ public class CodeRelationMapper { WritableWorkbook workbook = Workbook.createWorkbook(new File(res_dir)); if (workbook != null) { // 新建第一个工作表 - WritableSheet sheets = workbook.createSheet("Sheet1", 0); + WritableSheet sheet1 = workbook.createSheet("Sheet1", 0); // 构建工作表的表头 - Label label1 = new Label(0, 0, "类"); - sheets.addCell(label1); - Label label2 = new Label(1, 0, "追踪到的代码"); - sheets.addCell(label2); - Label label3 = new Label(2, 0, "追踪概率"); - sheets.addCell(label3); - Label label4 = new Label(3, 0, "参考关系类型"); - sheets.addCell(label4); - Label label5 = new Label(4, 0, "参考关系的目标"); - sheets.addCell(label5); + sheet1.addCell(new Label(0, 0, "类")); + sheet1.addCell(new Label(1, 0, "追踪到的代码")); + sheet1.addCell(new Label(2, 0, "追踪概率")); + // 新建第二个工作表 + WritableSheet sheet2 = workbook.createSheet("Sheet2", 1); + // 构建工作表的表头 + sheet2.addCell(new Label(0, 0, "类")); + sheet2.addCell(new Label(1, 0, "追踪到的代码")); + sheet2.addCell(new Label(2, 0, "追踪概率")); // 从第二行开始,保存每个类的追踪结果 Set ClsImg_shortName_set = mapped_classes.keySet(); int row = 1; for (String ClsImg_shortName : ClsImg_shortName_set) { UMLClass UML_class = mapped_classes.get(ClsImg_shortName); - // 保存追踪结果 - if (UML_class.mapped_javaFile_ls.size() > 0) { - for (MappedFile mapped_javaFile : UML_class.mapped_javaFile_ls) { - for (Pair P : mapped_javaFile.P_ls) { - if (P.getLeft() != null) { - // 类 - Label _class = new Label(0, row, UML_class.getTitle()); - sheets.addCell(_class); - // 追踪到的代码 - Label code = new Label(1, row, mapped_javaFile.java_file_dir); - sheets.addCell(code); - // 追踪概率 - Label 
ratio = new Label(2, row, P.getRight().toString()); - sheets.addCell(ratio); - // 参考关系类型 - Label ref_relation = new Label(3, row, P.getLeft().relation_type); - sheets.addCell(ref_relation); - // 参考关系的目标 - Label ref_relation_target = new Label(4, row, P.getLeft().related_ent.name); - sheets.addCell(ref_relation_target); - row++; - } else { - // 类 - Label _class = new Label(0, row, UML_class.getTitle()); - sheets.addCell(_class); - // 追踪到的代码 - Label code = new Label(1, row, mapped_javaFile.java_file_dir); - sheets.addCell(code); - // 追踪概率 - Label ratio = new Label(2, row, P.getRight().toString()); - sheets.addCell(ratio); - // 参考关系类型 - Label ref_relation = new Label(3, row, null); - sheets.addCell(ref_relation); - // 参考关系的目标 - Label ref_relation_target = new Label(4, row, null); - sheets.addCell(ref_relation_target); - row++; - } - } - } + // 其实这里可以做些筛选 + // 前65534个存在第一个表里 + if (row < 65535) { + row = saveTrace(sheet1, row, UML_class); } - // 当前类没有追踪到对应的代码实现 + // 从65535开始,后面的存在第二个表里 else { - // 类 - Label _class = new Label(0, row, UML_class.getTitle()); - sheets.addCell(_class); - // 追踪到的代码 - Label code = new Label(1, row, null); - sheets.addCell(code); - // 追踪概率 - Label ratio = new Label(2, row, null); - sheets.addCell(ratio); - // 参考关系类型 - Label ref_relation = new Label(3, row, null); - sheets.addCell(ref_relation); - // 参考关系的目标 - Label ref_relation_target = new Label(4, row, null); - sheets.addCell(ref_relation_target); - row++; + row = saveTrace(sheet2, row - 65534, UML_class); } } // 写入文件 @@ -844,6 +899,49 @@ public class CodeRelationMapper { } } + private static int saveTrace(WritableSheet sheet, int row, UMLClass UML_class) { + // 保存追踪结果 + try { + if (UML_class.mappedJavaFiles.size() > 0) { + for (MappedFile mapped_javaFile : UML_class.mappedJavaFiles) { + // 类 + sheet.addCell(new Label(0, row, UML_class.getTitle())); + // 追踪到的代码 + sheet.addCell(new Label(1, row, mapped_javaFile.java_file_dir)); + // 追踪概率 + double ratio = mapped_javaFile.P; + if (ratio == 1.0) { + double temp = sim_EntDocName_ClsCodeName(UML_class.getTitle(), mapped_javaFile.java_file_dir.substring(mapped_javaFile.java_file_dir.lastIndexOf("\\") + 1, mapped_javaFile.java_file_dir.lastIndexOf("."))); + if (temp > 0.5) { + ratio = temp; + } else { + ratio = 0.5 + temp; + } + } + sheet.addCell(new Label(2, row, Double.toString(ratio))); + row++; + } + } + // 当前类没有追踪到对应的代码实现 + else { + // 类 + sheet.addCell(new Label(0, row, UML_class.getTitle())); + // 追踪到的代码 + sheet.addCell(new Label(1, row, null)); + // 追踪概率 + sheet.addCell(new Label(2, row, null)); + row++; + } + } catch (RowsExceededException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (WriteException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return row; + } + /** * 检查追踪结果 */ @@ -859,14 +957,8 @@ public class CodeRelationMapper { String _class = sheet.getCell(0, row).getContents(); String code = sheet.getCell(1, row).getContents(); String ratio = sheet.getCell(2, row).getContents(); - String ref_relation = sheet.getCell(3, row).getContents(); - String ref_relation_target = sheet.getCell(4, row).getContents(); if (code != null) { - if (ref_relation != null) { - System.out.println(_class + "有" + ratio + "的概率追踪到代码中的" + code + ",参考其与文档实体" + ref_relation_target + "的" + ref_relation + "关系。"); - } else { - System.out.println(_class + "有" + ratio + "的概率追踪到代码中的" + code + ",这条追踪是没有相关关系的。"); - } + System.out.println(_class + "有" + ratio + "的概率追踪到代码中的" + code); } else { System.out.println(_class + "没有追踪到对应的代码实现。"); } 
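The new filterTraceResults in this file merges the per-relation probabilities P(Cls_img,Cls_code,R) into one P(Cls_img,Cls_code) with a noisy-OR style product and only keeps a candidate when both the merged value and the plain average clear the new thresholds (pre_threshold = 0.8, ave_threshold = 0.75 in the patch). Below is a minimal, self-contained sketch of just that combination step, using plain double lists instead of the project's MappedFile/Pair types; the class and method names are illustrative, not part of the patch.

import java.math.BigDecimal;
import java.util.List;

public class TraceProbabilityMergeSketch {
    // Same large-project settings as the new constants in CodeRelationMapper.
    private static final double PRE_THRESHOLD = 0.8;
    private static final double AVE_THRESHOLD = 0.75;

    /** P(Cls_img,Cls_code) = 1 - prod(1 - P(Cls_img,Cls_code,R)), computed with BigDecimal as in filterTraceResults. */
    static double merge(List<Double> perRelationProbs) {
        BigDecimal pai = BigDecimal.ONE;
        for (double p : perRelationProbs) {
            pai = pai.multiply(BigDecimal.valueOf(1.0 - p));
        }
        return BigDecimal.ONE.subtract(pai).doubleValue();
    }

    /** Keep the candidate if the merged probability is 1.0, or if both it and the average pass their thresholds. */
    static boolean keep(List<Double> perRelationProbs) {
        double merged = merge(perRelationProbs);
        double ave = perRelationProbs.stream().mapToDouble(Double::doubleValue).average().orElse(0.0);
        return merged == 1.0 || (merged > PRE_THRESHOLD && ave > AVE_THRESHOLD);
    }

    public static void main(String[] args) {
        // Two relations supporting the same Cls_code: merged = 1 - (1-0.8)*(1-0.9) = 0.98, average = 0.85 -> kept.
        System.out.println(keep(List.of(0.8, 0.9)));
        // One weak relation: merged = average = 0.6 -> dropped.
        System.out.println(keep(List.of(0.6)));
    }
}

The noisy-OR merge rewards a Cls_code that is reachable through several independent relations, while the average guard keeps a single strong relation from being drowned out by many weak ones.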
diff --git a/src/main/java/com/hy/java/uct/cdtocode/mapper/DocAnalyzer.java b/src/main/java/com/hy/java/uct/cdtocode/mapper/DocAnalyzer.java index c8e2b8b20c794b941f4780a1e6efcd1da3c9b924..26ba29eb0e13b0192699fb42bcb23f1591c5f27a 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/mapper/DocAnalyzer.java +++ b/src/main/java/com/hy/java/uct/cdtocode/mapper/DocAnalyzer.java @@ -82,7 +82,7 @@ public class DocAnalyzer { // 对每个相似词文件,遍历其中的<原词,相似词>对,以原词为key保存相似词。相似度为Jaccard相似度 for (String doc_dir : docDir_set) { // 处理一下赵子岩的文件,格式化为"原词,相似词"对 - processSimDoc(doc_dir); + processSimDoc(doc_dir, classShortName_set); FileEditor simEnts_doc = new FileEditor(doc_dir.substring(0, doc_dir.lastIndexOf(".")) + "-simEnts.txt"); List similar_names_raw = simEnts_doc.readLines(); if (similar_names_raw != null) { @@ -91,7 +91,16 @@ public class DocAnalyzer { // source_target[0]是原词,source_target[1]是相似词 String[] source_target = clsName_similarName.split(","); if (clsImg_EntDoc_justZhao.containsKey(source_target[0])) { - clsImg_EntDoc_justZhao.get(source_target[0]).add(Pair.createPair(source_target[1], sim_ClsImg_EntDoc(source_target[0], source_target[1]))); + // 从API文档中获取的体系结构信息是最准确的 + if (source_target[1].contains("class ")) { + if (source_target[1].substring(6) != null) { + if (Character.isUpperCase(source_target[1].substring(6).charAt(0))) { + clsImg_EntDoc_justZhao.get(source_target[0]).add(Pair.createPair(source_target[1], -1.0)); + } + } + } else { + clsImg_EntDoc_justZhao.get(source_target[0]).add(Pair.createPair(source_target[1], sim_ClsImg_EntDoc(source_target[0], source_target[1]))); + } } } } @@ -174,7 +183,14 @@ public class DocAnalyzer { for (String class_short_name : classShortName_set) { UMLClass Cls_img = classes_in_CD.get(class_short_name); for (Entity Ent_doc : Cls_img.mappedEnt_ls) { - Ent_doc.PE_doc = Ent_doc.Fn_Ndoc * Ent_doc.sim_ClsImg_EntDoc; + if (Ent_doc.sim_ClsImg_EntDoc == 1.0) { + Ent_doc.PE_doc = 1.0; + } else if (Ent_doc.sim_ClsImg_EntDoc == -1.0) { + Ent_doc.PE_doc = 1.0; + Ent_doc.sim_ClsImg_EntDoc = -1.0; + } else { + Ent_doc.PE_doc = Ent_doc.Fn_Ndoc * Ent_doc.sim_ClsImg_EntDoc; + } } } /* @@ -214,8 +230,10 @@ public class DocAnalyzer { * 每一行都应该是:原词,相似词 * * 其中,原词和相似词都不能有空格(即应该是类名) + * + * @param classShortName_set */ - private static void processSimDoc(String doc_dir) { + private static void processSimDoc(String doc_dir, Set classShortName_set) { FileEditor res_fe = null; // 找一下同目录下的赵子岩的结果,然后再格式化 FileEditor ziyan_fe = new FileEditor(doc_dir.substring(0, doc_dir.lastIndexOf(".")) + "-ziyan.txt"); @@ -227,11 +245,51 @@ public class DocAnalyzer { String[] source_sym_pair = ziyan_line.split(","); // source的词得去空格 String source = source_sym_pair[0].replaceAll(" ", ""); - // sym的词得根据其大小写判断是去空格还是保留空格 - String sym = processSym(source_sym_pair[1]); - res_fe.write(source + "," + sym + "\n", true); + if (classShortName_set.contains(source)) { + // sym的词得根据其大小写判断是去空格还是保留空格 + String sym = processClsName(source_sym_pair[1]); + res_fe.write(source + "," + sym + "\n", true); + } else { + // sym的词得根据其大小写判断是去空格还是保留空格 + String sym = processSym(source_sym_pair[1]); + res_fe.write(source + "," + sym + "\n", true); + } + } + } + } + + private static String processClsName(String string) { + String res = string.trim(); + // 检查res到底是类名拆开了(temp的每个词都是大写开头)、还是本身就是一个词组(temp中存在小写开头的词) + String[] temp = res.split(" "); + boolean is_s_clsName = true; + if (temp[0].equals("class")) { + for (int i = 1; i < temp.length; i++) { + if (Character.isLowerCase(temp[i].charAt(0))) { + is_s_clsName = false; + break; + } } + } 
else { + is_s_clsName = false; } + if (is_s_clsName) { + res = "class " + res.replaceAll(" ", "").substring(5); + } else { + boolean is_clsName = true; + // 看看temp中是否存在小写开头的词 + for (String word : temp) { + if (Character.isLowerCase(word.charAt(0))) { + is_clsName = false; + break; + } + } + // 如果res是类名,则需去空格。否则,直接返回原res即可 + if (is_clsName) { + res = res.replaceAll(" ", ""); + } + } + return res; } /** diff --git a/src/main/java/com/hy/java/uct/cdtocode/reader/CodeReader.java b/src/main/java/com/hy/java/uct/cdtocode/reader/CodeReader.java index 7ad057e94e7106d308002ef55e02e1ff5d082d49..4a96ad4e203384965f71e6b413616da7604a9620 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/reader/CodeReader.java +++ b/src/main/java/com/hy/java/uct/cdtocode/reader/CodeReader.java @@ -3,6 +3,7 @@ package com.hy.java.uct.cdtocode.reader; import java.io.File; import java.io.FileNotFoundException; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -43,8 +44,7 @@ public class CodeReader { CompilationUnit cu = StaticJavaParser.parse(new File(code_files_root.path)); Optional package_declaration = cu.getPackageDeclaration(); if (!package_declaration.isEmpty()) { - java_files.put(package_declaration.get().getNameAsString() + "." - + code_files_root.path.substring(code_files_root.path.lastIndexOf("\\") + 1, code_files_root.path.lastIndexOf(".")), code_files_root.path); + java_files.put(package_declaration.get().getNameAsString() + "." + code_files_root.path.substring(code_files_root.path.lastIndexOf("\\") + 1, code_files_root.path.lastIndexOf(".")), code_files_root.path); } } catch (FileNotFoundException e) { // TODO Auto-generated catch block @@ -68,4 +68,24 @@ public class CodeReader { System.out.println(class_name + "\t" + java_files.get(class_name)); } } + + /** + * 根据API文档,如果有包名与图中类名相似,则记录该包下所有的class + * + * 这是一次性方法,用完就注释掉 + */ + public static Set findPcks(String code_path_file_path) { + Set res = new HashSet<>(); + Map classFullName_javaFileDir_map = CodeReader.read(code_path_file_path); + Set clsFullNames = classFullName_javaFileDir_map.keySet(); + for (String clsFullName : clsFullNames) { + res.add(getParentPck(clsFullName)); + } + return res; + } + + private static String getParentPck(String clsFullName) { + String temp = clsFullName.substring(0, clsFullName.lastIndexOf(".")); + return temp.substring(temp.lastIndexOf(".") + 1); + } } diff --git a/src/main/java/com/hy/java/uct/cdtocode/util/MappedFile.java b/src/main/java/com/hy/java/uct/cdtocode/util/MappedFile.java index c4826c71d75e25b995ddaef2b0a9fee7ef838f16..526b0932c827b61e2cbdb9bc615e80ecd7cccf2b 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/util/MappedFile.java +++ b/src/main/java/com/hy/java/uct/cdtocode/util/MappedFile.java @@ -21,7 +21,13 @@ public class MappedFile { /** * 在关系R下从Cls_img追踪到Cls_code的概率 *

- * P(Cls_img,Cls_code)=PE_doc(Cls_img,Ent_doc)×PE_code(Ent_doc,Cls_code,R) + * P(Cls_img,Cls_code,R)=PE_doc(Cls_img,Ent_doc)×PE_code(Ent_doc,Cls_code,R) */ - public List<Pair<EntityRelation, Double>> P_ls = new ArrayList<>(); + public List<Pair<EntityRelation, Double>> PCCr_ls = new ArrayList<>(); + /** + * Probability of tracing from Cls_img to Cls_code + *

+ * P(Cls_img,Cls_code)=1-π(1-P(Cls_img,Cls_code,R)) + */ + public double P = 0.0; } diff --git a/src/main/java/com/hy/java/uct/cdtocode/vsmlsi/VSMAndLSIDataGenerator.java b/src/main/java/com/hy/java/uct/cdtocode/vsmlsi/VSMAndLSIDataGenerator.java new file mode 100644 index 0000000000000000000000000000000000000000..de71004947d94b3977ec2d6ed97f7acc51a300f7 --- /dev/null +++ b/src/main/java/com/hy/java/uct/cdtocode/vsmlsi/VSMAndLSIDataGenerator.java @@ -0,0 +1,251 @@ +package com.hy.java.uct.cdtocode.vsmlsi; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +import com.github.javaparser.StaticJavaParser; +import com.github.javaparser.ast.CompilationUnit; +import com.github.javaparser.ast.NodeList; +import com.github.javaparser.ast.body.ClassOrInterfaceDeclaration; +import com.github.javaparser.ast.body.FieldDeclaration; +import com.github.javaparser.ast.body.MethodDeclaration; +import com.github.javaparser.ast.type.ClassOrInterfaceType; +import com.hy.java.uct.cdtocode.reader.CDReader; +import com.hy.java.uct.cdtocode.reader.CodeReader; +import com.hy.java.uct.cdtocode.reader.DocReader; +import com.hy.java.uct.util.UMLClass; +import com.hy.java.utility.common.FileEditor; +import com.hy.java.utility.common.Pair; + +import jxl.Sheet; +import jxl.Workbook; +import jxl.read.biff.BiffException; +import jxl.write.Label; +import jxl.write.WritableSheet; +import jxl.write.WritableWorkbook; +import jxl.write.WriteException; +import jxl.write.biff.RowsExceededException; + +public class VSMAndLSIDataGenerator { + /** + * 将要追踪的类图放在cd_dir目录下 + */ + private static final String cd_dir = System.getProperty("user.dir") + "\\src\\main\\resources\\cdtocode\\cd\\"; + + /** + * 将设计文档放在doc_dir目录下 + */ + private static final String doc_dir = System.getProperty("user.dir") + "\\src\\main\\resources\\cdtocode\\doc\\"; + + /** + * 将要追踪的代码放在code_dir目录下 + */ + private static final String code_dir = System.getProperty("user.dir") + "\\src\\main\\resources\\cdtocode\\code\\"; + + /** + * 将追踪结果放在res_dir目录下 + */ + private static final String res_dir = System.getProperty("user.dir") + "\\src\\main\\resources\\cdtocode\\"; + + public static void main(String[] args) { + /* + * 1、读取模型信息 + */ + // 读取完UML图识别结果后,将实体信息保存在classes_in_CD里。形式为 + /* + * Apache OODT File Manager + */ + Map classes_in_CD = CDReader.read(cd_dir + "cd-Apache OODT File Manager.txt"); + /* + * Hadoop HDFS + */ + // Map classes_in_CD = CDReader.read(cd_dir + "cd-Hadoop HDFS.txt"); + /* + * Hadoop MapReduce + */ + // Map classes_in_CD = CDReader.read(cd_dir + "cd-Hadoop MapReduce.txt"); + // 检查结果,可注释掉 + // CDReader.check(classes_in_CD); + /* + * 2、读取code path指定的目录下所有java文件 + * + * <类全称(包+类名), java_file_path> + */ + // 记得改这里的路径 + Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path-fm"); + // Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path-hdfs"); + // Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path-mr"); + // 检查结果,可注释掉 + // CodeReader.check(classFullName_javaFileDir_map); + /* + * 3、记录模型中的每个类、以及代码中的每个java文件。 + * + * 生成一个excel文件,每一行包含的是:类名 属性 方法 + */ + VSMAndLSIDataGenerator.save(classes_in_CD, classFullName_javaFileDir_map, res_dir + "Apache OODT File Manager-VSMLSIdata.xls"); + // VSMAndLSIDataGenerator.save(classes_in_CD, classFullName_javaFileDir_map, res_dir + "Hadoop HDFS-VSMLSIdata.xls"); + // 
VSMAndLSIDataGenerator.save(classes_in_CD, classFullName_javaFileDir_map, res_dir + "Hadoop MapReduce-VSMLSIdata.xls"); + /* + * 4、读取excel文件,生成可用数据 + */ + VSMAndLSIDataGenerator.generate(res_dir + "Apache OODT File Manager-VSMLSIdata.xls", res_dir + "Apache OODT File Manager-VSMLSI.txt"); + // VSMAndLSIDataGenerator.generate(res_dir + "Hadoop HDFS-VSMLSIdata.xls", res_dir + "Hadoop HDFS-VSMLSI.txt"); + // VSMAndLSIDataGenerator.generate(res_dir + "Hadoop MapReduce-VSMLSIdata.xls", res_dir + "Hadoop MapReduce-VSMLSI.txt"); + } + + /** + * 保存追踪结果 + * + * 类,追踪到的代码,追踪概率,参考关系,参考关系的目标 + * + * @param classFullName_javaFileDir_map + */ + public static void save(Map mapped_classes, Map classFullName_javaFileDir_map, String res_dir) { + try { + // 工作簿 + WritableWorkbook workbook = Workbook.createWorkbook(new File(res_dir)); + if (workbook != null) { + // 新建第一个工作表 + WritableSheet sheet1 = workbook.createSheet("Sheet1", 0); + /* + * 从第1行开始,保存图中每个类的信息 + */ + Set ClsImg_shortName_set = mapped_classes.keySet(); + int row = 0; + for (String ClsImg_shortName : ClsImg_shortName_set) { + UMLClass UML_class = mapped_classes.get(ClsImg_shortName); + row = saveUMLCls(sheet1, row, UML_class); + } + /* + * 继续保存代码里每个类的信息 + */ + Set ClsCode_fullName_set = classFullName_javaFileDir_map.keySet(); + for (String ClsCode_fullName : ClsCode_fullName_set) { + String cls_file_dir = classFullName_javaFileDir_map.get(ClsCode_fullName); + row = saveCodeCls(sheet1, row, ClsCode_fullName, cls_file_dir); + } + // 写入文件 + workbook.write(); + workbook.close(); + } + } catch (RowsExceededException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (WriteException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + private static int saveUMLCls(WritableSheet sheet, int row, UMLClass UML_class) { + try { + sheet.addCell(new Label(0, row, UML_class.getTitle() + " " + UML_class.getAttrisStr().replaceAll("\n", " ") + " " + UML_class.getMethodsStr().replaceAll("\n", " "))); + row++; + } catch (RowsExceededException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (WriteException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return row; + } + + private static int saveCodeCls(WritableSheet sheet, int row, String ClsCode_fullName, String cls_file_dir) { + String content = ""; + String temp = ClsCode_fullName.substring(0, ClsCode_fullName.lastIndexOf(".")); + String pck = temp.substring(temp.lastIndexOf(".") + 1); + String cls_shortName = getClsShortNameFromFullName(ClsCode_fullName); + content += pck + "." 
+ cls_shortName; + try { + CompilationUnit clsCode = StaticJavaParser.parse(new File(cls_file_dir)); + Optional o_class = clsCode.getClassByName(cls_shortName); + if (o_class.isPresent()) { + ClassOrInterfaceDeclaration clsCode_unit = o_class.get(); + /* + * 先比较parent_cls的所有属性 + */ + List ClsCode_fields = clsCode_unit.getFields(); + // 对比UMLAttris和ClsCode_fields + for (FieldDeclaration attri : ClsCode_fields) { + content += " " + attri.getVariable(0).getNameAsString(); + } + /* + * 再比较parent_cls的所有方法 + */ + List ClsCode_methods = clsCode_unit.getMethods(); + // 对比UMLMethods和ClsCode_methods + for (MethodDeclaration method : ClsCode_methods) { + content += " " + method.getNameAsString(); + } + } + } catch (FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + try { + sheet.addCell(new Label(0, row, content)); + row++; + } catch (RowsExceededException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (WriteException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return row; + } + + /** + * 通过类全称,获得类短称 + */ + private static String getClsShortNameFromFullName(String clsCode_fullName) { + return clsCode_fullName.substring(clsCode_fullName.lastIndexOf(".") + 1); + } + + private static void generate(String xls_dir, String txt_dir) { + FileEditor fe = new FileEditor(txt_dir); + fe.write("", false); + try { + // 工作簿 + Workbook book = Workbook.getWorkbook(new File(xls_dir)); + // 获得第一个工作表对象 + Sheet sheet = book.getSheet("Sheet1"); + // Sheet sheet = book.getSheet(0); + int rows = sheet.getRows(); + for (int row = 0; row < rows; row++) { + String class_info = sheet.getCell(0, row).getContents(); + // 根据python脚本格式编写内容 + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + // 给我好好写!!! + fe.write(class_info, true); + } + book.close(); + } catch (BiffException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } +} diff --git a/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIParser.java b/src/main/java/com/hy/java/uct/cdtocode/vsmlsi/VSMAndLSIParser.java similarity index 76% rename from src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIParser.java rename to src/main/java/com/hy/java/uct/cdtocode/vsmlsi/VSMAndLSIParser.java index 0242f12dc821b109a411f93ffdf13cf1ab8f6d03..265744a70ac3c6004b0021263ad4d8c3f7e6bf2f 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIParser.java +++ b/src/main/java/com/hy/java/uct/cdtocode/vsmlsi/VSMAndLSIParser.java @@ -1,4 +1,4 @@ -package com.hy.java.uct.cdtocode; +package com.hy.java.uct.cdtocode.vsmlsi; /** * 解析VSM和LSI结果文件(csv文件) diff --git a/src/main/java/com/hy/java/uct/util/UMLClass.java b/src/main/java/com/hy/java/uct/util/UMLClass.java index eb53a305ebd8a8c1778b7390f95322f8008559bc..3426345f7efba8e4ff0e5c027112e39e956a363b 100644 --- a/src/main/java/com/hy/java/uct/util/UMLClass.java +++ b/src/main/java/com/hy/java/uct/util/UMLClass.java @@ -44,20 +44,27 @@ public class UMLClass { public Set mappedEnt_names = new HashSet<>(); /** - * 这个类所真正映射到的文件list(由于抽象层次不同,所以可能不止一个java文件是真正的映射) + * 这个类映射到的文件list(由于抽象层次不同,所以可能不止一个java文件是真正的映射) *

- * Each mapping stores the java file path and P(Cls_img,Cls_code) + * This list may contain duplicate results, so it still needs to be filtered + *

+ * Each mapping stores the java file path and P(Cls_img,Cls_code,R) + */ + public List<MappedFile> duplicated_mappedJavaFiles_underDiffRelations = new ArrayList<>(); + + /** + * The list of files this class is truly mapped to (one mapping under each relation R) + *

- * This list may contain duplicate results, so it still needs to be filtered + * Each mapping stores the java file path and P(Cls_img,Cls_code,R) + */ - public List<MappedFile> duplicated_mapped_javaFile_ls = new ArrayList<>(); + public List<MappedFile> mappedJavaFiles_underDiffRelations = new ArrayList<>(); /** - * The list of files this class is truly mapped to (since abstraction levels differ, more than one java file may be the true mapping) + * The list of files this class is truly mapped to + *

* 每个映射保存了java文件地址和P(Cls_img,Cls_code) */ - public List mapped_javaFile_ls = new ArrayList<>(); + public List mappedJavaFiles = new ArrayList<>(); public String getTitle() { return title; diff --git a/src/main/resources/cdtocode/Apache OODT File Manager-VSMLSI.txt b/src/main/resources/cdtocode/Apache OODT File Manager-VSMLSI.txt new file mode 100644 index 0000000000000000000000000000000000000000..700433cd222be4e07037b99e65ac5bf5286ce4bd --- /dev/null +++ b/src/main/resources/cdtocode/Apache OODT File Manager-VSMLSI.txt @@ -0,0 +1 @@ +Reference Versioner Element Metadata ProductType Product exceptions.CacheException serialVersionUIDexamples.MimeTypeExtractor doConfigure doExtractstructs.Product productId productName productType productStructure references transferStatus rootRef STATUS_TRANSFER STATUS_RECEIVED STRUCTURE_FLAT STRUCTURE_HIERARCHICAL STRUCTURE_STREAM VALID_STRUCTURES LOG productReceivedTime getProductType setProductType getProductStructure setProductStructure getProductReferences setProductReferences getProductName setProductName getProductId setProductId getTransferStatus setTransferStatus getRootRef setRootRef getDefaultFlatProduct toXML parse setProductRecievedTime getProductReceivedTime toStringexceptions.IngestException serialVersionUID maindatatransfer.S3DataTransferer s3Client bucketName encrypt setFileManagerUrl transferProduct retrieveProduct deleteProduct stageFile stripProtocolaction.GetProductByIdCliAction productId getProduct setProductIdquery.QueryResult product metadata toStringFormat getProduct setProduct getMetadata setMetadata getToStringFormat setToStringFormat toString convertMetadataToString concatMetadataIntoString equals hashCodecatalog.LuceneCatalog LOG logger indexDir reader indexFilePath valLayer CATALOG_CACHE generator pageSize writeLockTimeout commitLockTimeout mergeFactor addMetadata removeMetadata addProduct modifyProduct removeProduct setProductTransferStatus addProductReferences getProductById getProductById getCompleteProductById getCompleteProductById getCompleteProductById getProductByName getProductByName getProductReferences getProducts getProducts getProductsByProductType getProductsByProductType getMetadata getReducedMetadata query getTopNProducts getTopNProducts getValidationLayer getNumProducts getFirstPage getLastProductPage getNextPage getPrevPage pagedQuery removeProductDocument addCompleteProductToIndex toCompleteProduct toCompleteProduct toDoc hasMetadataAndRefs getNumHits paginateQuery getQuery quietGetElementsdatatransfer.RemoteDataTransferer NUM_BYTES fileManagerUrl chunkSize client LOG setFileManagerUrl transferProduct retrieveProduct deleteProduct remoteTransfer quietNotifyTransferProduct quietNotifyProductTransferCompletecatalog.DataSourceCatalogFactory VAL VAL1 dataSource validationLayer fieldIdStr pageSize cacheUpdateMinutes lenientFields productIdString orderedValues configure createCatalogrpc.FileManagerServerFactoryutil.XmlStructFactory LOG getProductType writeProductTypeMapXmLDocument writeElementXmlDocument writeProductTypeXmlDocument getProductTypeMapXmlDocument getElementXmlDocument getProductTypeXmlDocument getElement getProductTypeElementList friendlyXmldatatransfer.InPlaceDataTransferer LOG client setFileManagerUrl transferProduct retrieveProduct deleteProductexamples.FinalFileLocationExtractor replaceLocation doConfigure doExtract scrubRefsutil.XmlRpcStructFactory getXmlRpcFileTransferStatus getFileTransferStatusFromXmlRpc getXmlRpcFileTransferStatuses getFileTransferStatusesFromXmlRpc getXmlRpcProductPage 
getProductPageFromXmlRpc getXmlRpcComplexQuery getComplexQueryFromXmlRpc getXmlRpcQueryFilter getQueryFilterFromXmlRpc getXmlRpcFilterAlgor getFilterAlgorFromXmlRpc getXmlRpcQueryResults getQueryResultsFromXmlRpc getXmlRpcQueryResult getQueryResultFromXmlRpc getXmlRpcProduct getProductFromXmlRpc getProductListFromXmlRpc getXmlRpcProductList getXmlRpcProductTypeList getProductTypeListFromXmlRpc getXmlRpcProductType getProductTypeFromXmlRpc getXmlRpcTypeExtractors getXmlRpcExtractorSpec getXmlRpcTypeHandlers getXmlRpcTypeHandler getTypeExtractorsFromXmlRpc getExtractorSpecFromXmlRpc getTypeHandlersFromXmlRpc getTypeHandlerFromXmlRpc getPropertiesFromXmlRpc getXmlRpcProperties getXmlRpcReferences getReferencesFromXmlRpc getXmlRpcReference getReferenceFromXmlRpc getReferenceFromXmlRpcHashtable getXmlRpcElementListHashtable getXmlRpcElementList getElementListFromXmlRpc getXmlRpcElement getXmlRpcElementHashTable getElementFromXmlRpc getXmlRpcQuery getQueryFromXmlRpc getXmlRpcQueryCriteriaList getQueryCriteriaListFromXmlRpc getXmlRpcQueryCriteria getQueryCriteriaFromXmlRpcexamples.MajorMinorVersionTypeHandler getCatalogValue getOrigValuesystem.FileManager LOG logger catalog repositoryManager dataTransfer transferStatusTracker expandProductMet configurationManager setCatalog refreshConfigAndPolicy transferringProduct getCurrentFileTransfer getCurrentFileTransfers getProductPctTransferred getRefPctTransferred removeProductTransferStatus isTransferComplete pagedQuery getFirstPage getLastPage getNextPage getPrevPage addProductType setProductTransferStatus getNumProducts getTopNProducts getTopNProductsByProductType hasProduct getProductTypes getProductReferences getProductById getProductByName getProductsByProductType getElementsByProductType getElementById getElementByName complexQuery getProductTypeByName getProductTypeById updateMetadata ingestProduct retrieveFile transferFile moveProduct removeFile modifyProduct removeProduct catalogProduct addMetadata runExtractors addProductReferences setProductType query getReducedMetadata getMetadata getOrigValues getCatalogValues getCatalogQuery applyFilterToResults sortQueryResultList buildProductMetadata loadConfiguration setDataTransferaction.GetPrevPageCliAction productTypeName currentPageNum execute setProductTypeName setCurrentPageNumextractors.AbstractFilemgrMetExtractor LOG configuration extractMetadata configure doExtract doConfigure validateProduct addMetadataIfUndefined merge getProductFile getRootRefPathstructs.FreeTextQueryCriteria serialVersionUID elementName values noiseWords noiseWordHash getValues setValue addValue addFreeText getElementName setElementName toStringingest.CmdLineIngester LOG main readProdFilesFromStdinsystem.AvroFileManagerServer logger port server fileManager startUp loadConfiguration shutdown isAlive setCatalog refreshConfigAndPolicy transferringProduct getCurrentFileTransfer getCurrentFileTransfers getProductPctTransferred getRefPctTransferred removeProductTransferStatus isTransferComplete pagedQuery getFirstPage getLastPage getNextPage getPrevPage setProductTransferStatus getNumProducts getTopNProductsByProductType getTopNProducts hasProduct getMetadata getReducedMetadata getProductTypes getProductReferences getProductById getProductByName getProductsByProductType getElementsByProductType getElementById getElementByName complexQuery query getProductTypeByName getProductTypeById updateMetadata addProductType catalogProduct addMetadata addProductReferences ingestProduct retrieveFile transferFile moveProduct removeFile 
modifyProduct removeProduct getCatalogValues getOrigValues getCatalogQuerysystem.FileManagerClientMain mainmetadata.CoreMetKeysrepository.XMLRepositoryManager productTypeHomeUris productTypeMap LOG addProductType modifyProductType removeProductType getProductTypeById getProductTypeByName getProductTypes saveProductTypes loadProductTypes getDocumentRootaction.SqlQueryCliAction query getQuery setQuerydatatransfer.DataTransfersolr.UUIDProductIdGenerator generateIdtools.MetadataBasedProductMover DOUBLE pathSpec fmgrClient LOG moveProducts locationsMatch moveProducts mainaction.GetProductPercentTransferredCliAction productId productTypeName execute setProductId setProductTypeNametools.CASAnalyzer stopSet factory STOP_WORDS reader createComponents tokenStreamsvalidation.ScienceDataValidationLayerFactory dataSource createValidationLayeraction.AbstractQueryCliAction sortBy outputFormat delimiter filterAlgor startDateTimeMetKey endDateTimeMetKey priorityMetKey versionConverter execute getQuery setSortBy setOutputFormat setDelimiter setFilterAlgor setStartDateTimeMetKey setEndDateTimeMetKey setPriorityMetKey setVersionConverterrepository.ScienceDataRepositoryManagerFactory dataSource createRepositoryManagersolr.ProductSerializeringest.StdIngester LOG fmClient clientTransferServiceFactory ingest ingest ingest hasProduct check checkOrSetFileManager setFileManager getProductType hasProduct closesolr.CompleteProduct product metadata getProduct setProduct getMetadata setMetadatacatalog.LenientDataSourceCatalog LOG isoFormat dbFormat addMetadata getMetadataTypes removeMetadata getMetadata populateProductMetadata getReducedMetadata addMetadataValue removeMetadataValue getResultListSize getSqlQuerytools.QueryTool freeTextBlock client LOG query parseQuery generateCASQuery main safeGetProductTypes performSqlQuery exittools.MetadataDumper LOG fmClient FILENAME PRODUCT_NAME getMetadata writeMetFileToDir dumpMetadata dumpMetadata mainvalidation.XMLValidationLayerFactory dirList LOG createValidationLayerversioning.ProductTypeMetVersioner createDataStoreReferencesexceptions.VersioningException serialVersionUIDvalidation.ValidationLayerFactoryingest.RemoteableCachesolr.DefaultProductSerializer LOG getMimeType serialize serialize deserialize serialize addKeyValueToMap generateInsertDocuments generateUpdateDocuments toDoc encodeIndexField encodeUpdateField deserialize parseXml deserializeSingleValueField deserializeMultiValueFieldvalidation.ScienceDataValidationLayer LOG ds addElement addElementToProductType getElementById getElementByName getElements getElements modifyElement removeElement removeElementFromProductTypedatatransfer.RemoteDataTransferFactory chunkSize LOG createDataTransfersolr.SolrCatalogFactory solrUrl productIdGenerator productSerializer LOG configure createCatalogversioning.DateTimeVersioner LOG createDataStoreReferences addProductDateTimeToReferencessystem.CommonsXmlRpcTransport url client userAgentHeader http11 gzip rgzip creds method timeout connecttimeout password user auth sendXmlRpc setHttp11 setGzip setRGzip setUserAgent setTimeout setConnectionTimeout setBasicAuthentication setBasicAuthentication endClientRequestrpc.XmlRpcFileManagerServerFactory port getPort setPort createFileManagerServertype.TypeHandler elementName setElementName getElementName preQueryHandle handleQueryCriteria postGetMetadataHandle preAddMetadataHandle handleRangeQueryCriteria handleTermQueryCriteriautil.QueryUtils getQueryResultsAsString getQueryResultsAsStringtools.ExpImpCatalog sourceClient destClient srcCatalog 
destCatalog ensureUnique LOG doExpImport doExpImport exportTypeToDest exportProductsToDest isEnsureUnique setEnsureUnique main typesExist typeInList safeHasProductTypeByNameexceptions.CatalogException serialVersionUIDstructs.BooleanQueryCriteria AND OR NOT serialVersionUID operator terms addTerm getTerms setOperator getOperator getElementName setElementName toStringtools.OptimizeLuceneCatalog DOUBLE INT reader config catalogPath mergeFactor LOG doOptimize mainsystem.XmlRpcFileManagerClient client LOG fileManagerUrl dataTransfer refreshConfigAndPolicy isAlive transferringProduct removeProductTransferStatus isTransferComplete moveProduct modifyProduct removeProduct getCurrentFileTransfer getCurrentFileTransfers getProductPctTransferred getRefPctTransferred pagedQuery getFirstPage getLastPage getNextPage getPrevPage addProductType hasProduct getNumProducts getTopNProducts getTopNProducts setProductTransferStatus addProductReferences addMetadata updateMetadata catalogProduct getMetadata getReducedMetadata removeFile retrieveFile transferFile getProductsByProductType getElementsByProductType getElementById getElementByName getElementByName complexQuery query getProductTypeByName getProductTypeById getProductTypes getProductReferences getProductById getProductByName ingestProduct getCatalogValues getOrigValues getCatalogQuery main getFileManagerUrl setFileManagerUrl getDataTransfer setDataTransfer closesolr.SolrCatalog productSerializer productIdGenerator solrClient LOG addMetadata removeMetadata addProduct modifyProduct removeProduct setProductTransferStatus addProductReferences getProductById getProductByName getProductReferences getProducts getProducts getProducts getProductsByProductType getMetadata getReducedMetadata query getTopNProducts pagedQuery getFirstPage getLastProductPage getNextPage getPrevPage getProductsFromDocument getTopNProducts getValidationLayer getNumProducts getCompleteProductById getCompleteProductByName extractCompleteProduct newProductPagedatatransfer.TransferStatusTracker currentProductTransfers catalog LOG getCurrentFileTransfer transferringProduct getCurrentFileTransfers getPctTransferred getPctTransferred removeProductTransferStatus isTransferComplete getBytesTransferred quietGetReferences isDirdatatransfer.InPlaceDataTransferFactory createDataTransfertools.SolrIndexer LOG SOLR_INDEXER_CONFIG SOLR_URL FILEMGR_URL ACCESS_KEY ACCESS_URL PRODUCT_NAME config server fmUrl solrUrl solrFormat delete commit optimize getSolrDocument indexMetFile indexProductTypes safeFirstPage indexAll indexProduct indexProductByName indexProduct deleteProduct deleteProductByName deleteProductFromIndex performSubstitution formatDate buildCommandLine main readProductIdsFromStdinexamples.DateTimeExpandMetExtractor FULL_DATE_TIME_KEY FULL_DATE_TIME_FORMAT YEAR_KEY MONTH_KEY DAY_KEY HOUR_KEY MINUTE_KEY SECOND_KEY fullDateTimeKey fullDateTimeFormat yearKey monthKey dayKey hourKey minuteKey secondKey doConfigure doExtract getKey getFullDateTime createDateFieldaction.AddProductTypeCliAction productTypeId productTypeName productTypeDescription fileRepositoryPath versioner execute setProductTypeId setProductTypeName setProductTypeDescription setFileRepositoryPath setVersionerexceptions.ConnectionException serialVersionUIDtools.ProductTypeDocTool xslFilePath outputDirPath doProductTypeDoc mainaction.GetProductByNameCliAction productName getProduct setProductNamesystem.FileManagerServerMain mainvalidation.DataSourceValidationLayer LOG dataSource quoteFields addElement modifyElement removeElement 
addElementToProductType removeElementFromProductType addParentToProductType removeParentFromProductType getElements getElements getElementById getElementByNameversioning.ConfigurableMetadataBasedFileVersioner BASE_PROPERTY ALL properties createDataStoreReferences getFilePathSpecingest.LocalCacheFactory createCacherpc.AvroFileManagerServerFactory port createFileManagerServer getPort setPortaction.GetCurrentTransfersCliAction executecatalog.Catalogaction.FileManagerCliAction fmc getUrl getClient setClientstructs.FileTransferStatus fileRef bytesTransferred parentProduct getBytesTransferred setBytesTransferred getFileRef setFileRef getParentProduct setParentProduct computePctTransferredutil.AvroTypeFactory logger getAvroReference getReference getAvroExtractorSpec getExtractorSpec getAvroMetadata getMetadata getAvroTypeHandler getTypeHandler getAvroProductType getProductType getAvroProduct getProduct getAvroFileTransferStatus getFileTransferStatus getAvroQueryCriteria getQueryCriteria getAvroQuery getQuery getAvroProductPage getProductPage getAvroElement getElement getAvroQueryResult getQueryResult getAvroFilterAlgor getFilterAlgor getAvroQueryFilter getQueryFilter getAvroComplexQuery getComplexQuery getReferences getAvroReferencesversioning.BasicVersioner LOG createDataStoreReferencescatalog.MappedDataSourceCatalog typeMap addMetadata addProductReferences getMetadata getProductReferences modifyProduct removeMetadata removeProduct pagedQuery query getProductTypeTableNameextractors.FilemgrMetExtractorutil.GenericFileManagerObjectFactory LOG getDataTransferServiceFromFactory getRepositoryManagerServiceFromFactory getCatalogServiceFromFactory getValidationLayerFromFactory getCacheFromFactory getVersionerFromClassName getExtractorFromClassName getTypeHandlerFromClassName getFilterAlgorFromClassName getVersionConverterFromClassNamesystem.FileManagerClientdatatransfer.LocalDataTransferer LOG client setFileManagerUrl transferProduct retrieveProduct deleteProduct main copyDirToDir moveDirToProductRepo moveFilesToProductRepo copyFilesToDir moveFile copyFile quietNotifyTransferProduct quietNotifyProductTransferComplete finalizeaction.DeleteProductByIdCliAction productId getProductToDelete setProductIdexceptions.FileManagerExceptionaction.AbstractDeleteProductCliAction execute getProductToDeleterepository.DataSourceRepositoryManager dataSource LOG addProductType modifyProductType removeProductType getProductTypeById getProductTypeByName getProductTypesingest.RmiCacheFactory rmiCacheServerUrn createCacheversioning.Versioneraction.AbstractGetProductCliAction execute getProducttype.ValueReplaceTypeHandler postGetMetadataHandle preAddMetadataHandle handleRangeQueryCriteria handleTermQueryCriteria getCatalogValue getOrigValuedatatransfer.DataTransferFactorystructs.Query criteria getCriteria setCriteria addCriterion toStringaction.DeleteProductByNameCliAction productName getProductToDelete setProductNameaction.HasProductCliAction productName execute setProductNamerepository.XMLRepositoryManagerFactory productTypeDirList LOG createRepositoryManagerexceptions.DataTransferException serialVersionUIDauth.Dispatchersolr.Parameters ID NS PRODUCT_ID PRODUCT_NAME PRODUCT_STRUCTURE PRODUCT_TRANSFER_STATUS PRODUCT_RECEIVED_TIME PRODUCT_TYPE_NAME PRODUCT_TYPE_ID REFERENCE_ORIGINAL REFERENCE_DATASTORE REFERENCE_FILESIZE REFERENCE_MIMETYPE ROOT ROOT_REFERENCE_ORIGINAL ROOT_REFERENCE_DATASTORE ROOT_REFERENCE_FILESIZE ROOT_REFERENCE_MIMETYPE SOLR_DATE_FORMAT SOLR_DATE_TIME_FORMATTER MIME_TYPE_XML MIME_TYPE_JSON PAGE_SIZE 
NULLaction.GetNextPageCliAction productTypeName currentPageNum execute setProductTypeName setCurrentPageNumvalidation.DataSourceValidationLayerFactory dataSource quoteFields createValidationLayerdatatransfer.LocalDataTransferFactory createDataTransferaction.DumpMetadataCliAction productId outputDir execute setProductId setOutputDir generateFilenamestructs.ExtractorSpec className configuration getClassName setClassName getConfiguration setConfigurationutil.RpcCommunicationFactory LOG getClientFactoryName setPror createClient createClient createServerversioning.MetadataBasedFileVersioner filePathSpec LOG flatProducts createDataStoreReferences parseFilePathSpec getFilePathSpec setFilePathSpec isFlatProducts setFlatProductssolr.NameProductIdGenerator generateIdutil.DbStructFactory getProductType getProduct getProduct getReference getElement getParent toScienceDataElement toScienceDataProduct toScienceDataProductTypecatalog.ScienceDataCatalog dataSource validationLayer pageSize LOG addMetadata createGranule createParameter commitQuery addProduct addProductReferences getMetadata getNumProducts getProductById getProductByName getProductReferences getProducts getProductsByProductType getReducedMetadata getTopNProducts getTopNProducts getValidationLayer modifyProduct query removeMetadata removeProduct setProductTransferStatus getFirstPage getLastProductPage getNextPage getPrevPage pagedQuery paginateQuery getResultListSize getStartDateTime getEndDateTime getMetadataSubsetexamples.TikaAutoDetectExtractor doConfigure doExtract getMetadataFromTika transformdatatransfer.S3DataTransfererFactory BUCKET_NAME_PROPERTY REGION_PROPERTY ACCESS_KEY_PROPERTY SECRET_KEY_PROPERTY ENCRYPT_PROPERTY createDataTransferaction.GetFilePercentTransferredCliAction origRef execute setOrigRef getUrirepository.DataSourceRepositoryManagerFactory dataSource createRepositoryManagervalidation.ValidationLayersystem.AvroFileManagerClient logger client proxy fileManagerUrl dataTransfer refreshConfigAndPolicy isAlive transferringProduct removeProductTransferStatus isTransferComplete moveProduct modifyProduct removeProduct getCurrentFileTransfer getCurrentFileTransfers getProductPctTransferred getRefPctTransferred pagedQuery getFirstPage getLastPage getNextPage getPrevPage addProductType hasProduct getNumProducts getTopNProducts getTopNProducts setProductTransferStatus addProductReferences addMetadata updateMetadata catalogProduct getMetadata getReducedMetadata removeFile retrieveFile transferFile getProductsByProductType getElementsByProductType getElementById getElementByName complexQuery query getProductTypeByName getProductTypeById getProductTypes getProductReferences getProductById getProductByName ingestProduct getCatalogValues getOrigValues getCatalogQuery getFileManagerUrl setFileManagerUrl getDataTransfer setDataTransfer closecatalog.DataSourceCatalog INT dataSource LOG validationLayer fieldIdStringFlag pageSize productIdString orderedValues PRODUCT_CACHE cacheUpdateMinutes addMetadata removeMetadata addProduct modifyProduct removeProduct setProductTransferStatus addProductReferences getProductById getProductByName getProductReferences getProducts getProductsByProductType getMetadata getReducedMetadata query getTopNProducts getTopNProducts getValidationLayer addMetadataValue removeMetadataValue getNumProducts getFirstPage getLastProductPage getNextPage getPrevPage pagedQuery getResultListSize stillFresh getProductsFromCache flagCacheUpdate getProductsByProductTypeCached paginateQuery getSqlQuery updateReferences 
quoteItrepository.RepositoryManagerFactorycatalog.CatalogFactoryaction.IngestProductCliAction LOG productName productStructure productTypeName metadataFile dataTransferer references execute setProductName setProductStructure setProductTypeName setMetadataFile setDataTransferer setReferences getUrisolr.ProductIdGeneratorstructs.RangeQueryCriteria serialVersionUID elementName startValue endValue inclusive getStartValue setStartValue getEndValue setEndValue getInclusive setInclusive getElementName setElementName toStringstructs.TermQueryCriteria serialVersionUID elementName value getValue setValue getElementName setElementName toStringexamples.FileAttributesExtractor BASIC_FILE_ATTRIBUTES POSIX_FILE_ATTRIBUTES attributes LOG doExtract doConfigure getMetadataFromFileAttributesaction.GetLastPageCliAction productTypeName execute setProductTypeNameaction.LuceneQueryCliAction FREE_TEXT_BLOCK query reducedProductTypes reducedMetadataKeys getQuery setQuery setReducedProductTypes setReducedMetadataKeys parseQuery generateCASQueryquery.ComplexQuery reducedProductTypeNames reducedMetadata queryFilter sortByMetKey toStringResultFormat getReducedProductTypeNames setReducedProductTypeNames getReducedMetadata setReducedMetadata getQueryFilter setQueryFilter getSortByMetKey setSortByMetKey getToStringResultFormat setToStringResultFormattools.DumpDbElementsToXml mainaction.GetFirstPageCliAction productTypeName execute setProductTypeNameaction.RetrieveFilesCliAction productId productName dt destination execute setProductId setProductName setDataTransferFactory setDestinationconv.VersionConverterversioning.InPlaceVersioner LOG createDataStoreReferencesingest.Ingesteringest.CachedIngester cache LOG hasProduct hasProduct resynsc init closeversioning.SingleFileBasicVersioner FILENAME_FIELD LOG createDataStoreReferencesingest.RmiCache LOG rmiCacheServer clear contains setFileManager size sync sync setUniqueElementProductTypeNames sync setUniqueElementName getFileManagerUrlingest.RmiCacheServer LOG serialVersionUID cache uniqueElementName uniqueElementProductTypeNames reg launchServer launchServer stopServer clear contains setFileManager size sync sync sync getFileManagerUrl getUniqueElementProductTypeNames setUniqueElementProductTypeNames getUniqueElementName setUniqueElementName syncWith launchRmiServercatalog.ScienceDataCatalogFactory dataSource pageSize createCatalogexceptions.ValidationLayerException serialVersionUIDstructs.ProductType productTypeId name description productRepositoryPath versioner typeMetadata extractors handlers getDescription setDescription getName setName getProductTypeId setProductTypeId getProductRepositoryPath setProductRepositoryPath getVersioner setVersioner getTypeMetadata setTypeMetadata getExtractors setExtractors getHandlers setHandlers toString blankProductTypevalidation.XMLValidationLayer LOG productTypeElementMap subToSuperMap elementMap xmlFileDirUris addElement modifyElement removeElement addElementToProductType removeElementFromProductType getElements getElements getElementById getElementByName getElements getSubToSuperMap addParentForProductType removeParentForProductType saveElementsAndMappings loadElements loadProductTypeMap getDocumentRootstructs.Reference LOG origReference dataStoreReference fileSize mimeType mimeTypeRepository STREAM_REFERENCE_DELIMITER getDataStoreReference setDataStoreReference getOrigReference setOrigReference getFileSize setFileSize getMimeType setMimeType setMimeType toStringstructs.Element elementId elementName dcElement description hashCode 
getDCElement setDCElement getElementId setElementId getElementName setElementName getDescription setDescription blankElementextractors.CoreMetExtractor namespaceAware elementNs nsSeparator nsReplaceElements doExtract doConfigure isNsReplaceexamples.FilenameRegexMetExtractor filenamePattern metadataKeys doConfigure doExtractfilter.TimeEvent startTime priority setPriority getPriority getStartTime getEndTime getDuration equals toString happenAtSameTime compareTo hashCodecatalog.MappedDataSourceCatalogFactory typeMap TYPE_MAP_KEY createCatalogfilter.ObjectTimeEvent timeObj getTimeObject equals toString hashCodequery.QueryResultComparator sortByMetKey getSortByMetKey setSortByMetKey compareexceptions.QueryFormulationException serialVersionUIDconv.AsciiSortableVersionConverter convertToPriorityauth.SecureWebServer dispatchers execute addDispatcherrepository.ScienceDataRepositoryManager LOG dataSource addProductType getProductTypeById getProductTypeByName getProductTypes modifyProductType removeProductTypecatalog.LuceneCatalogFactory VAL VAL1 VAL2 VAL3 indexFilePath config validationLayer pageSize writeLockTimeOut commitLockTimeOut mergeFactor lenientFields LOG createCatalogingest.CacheFactoryrepository.RepositoryManagerexceptions.RepositoryManagerException serialVersionUIDsystem.FileManagerServerversioning.VersioningUtils LOG DIR_FILTER FILE_FILTER getReferencesFromDir getURIsFromDir createBasicDataStoreRefsHierarchical createBasicDataStoreRefsFlat createBasicDataStoreRefsStream createDataStoreRefStream addRefsFromUris getAbsolutePathFromUri quietGetFileSizeFromUriversioning.DirectoryProductVersioner createDataStoreReferencesquery.QueryFilter startDateTimeMetKey filterAlgor converter getStartDateTimeMetKey setStartDateTimeMetKey getEndDateTimeMetKey setEndDateTimeMetKey getPriorityMetKey setPriorityMetKey getFilterAlgor setFilterAlgor getConverter setConverteraction.GetNumProductsCliAction productTypeName execute setProductTypeNameaction.GetProductTypeByNameCliAction productTypeName execute setProductTypeNamefilter.FilterAlgor epsilon setEpsilon getEpsilon filterEventssolr.SolrClient solrUrl LOG index delete queryProductById queryProductByName queryProductsByDate queryProductsByDateAndType commit query doGet doPost doHttp buildUpdateUrl buildSelectUrlutil.Paginationrpc.XmlRpcFileManagerClientFactory url testConnection setUrl getUrl setTestConnection getTestConnection createFileManagerClientsystem.XmlRpcFileManager webServerPort catalog repositoryManager dataTransfer LOG webServer transferStatusTracker expandProductMet configurationManager configurationListener setCatalog isAlive refreshConfigAndPolicy transferringProduct transferringProductCore getCurrentFileTransferCore getCurrentFileTransfers getProductPctTransferred getProductPctTransferredCore getRefPctTransferred getRefPctTransferredCore removeProductTransferStatus removeProductTransferStatusCore isTransferComplete isTransferCompleteCore pagedQuery pagedQueryCore getFirstPage getFirstPageCore getLastPage getLastPageCore getNextPage getNextPageCore getPrevPage getPrevPageCore addProductType addProductTypeCore setProductTransferStatus setProductTransferStatusCore getNumProducts getNumProductsCore getTopNProducts getTopNProducts getTopNProductsCore hasProduct getMetadata getMetadataCore getReducedMetadata getReducedMetadataCore getProductTypes getProductReferences getProductReferencesCore getProductById getProductByName getProductsByProductType getProductsByProductTypeCore getElementsByProductType getElementById getElementByName complexQuery 
complexQueryCore query queryCore getProductTypeByName getProductTypeById updateMetadata updateMetadataCore catalogProduct catalogProductCore addMetadata addMetadataCore addProductReferencesCore addProductReferences ingestProduct ingestProductCore retrieveFile transferFile moveProduct moveProductCore removeFile modifyProduct modifyProductCore removeProduct removeProduct getCatalogValues getCatalogValuesCore getOrigValues getOrigValuesCore getCatalogQuery getCatalogQueryCore main shutdown catalogProduct addMetadata runExtractors addProductReferences setProductType query getReducedMetadata getMetadata getOrigValues getCatalogValues getCatalogQuery applyFilterToResults sortQueryResultList buildProductMetadata loadConfigurationtools.RangeQueryTester startFieldName endFieldName startFieldStartValue startFieldEndValue endFieldStartValue endFieldEndValue indexPath LOG reader doRangeQuery main getEndFieldName setEndFieldName getIndexPath setIndexPath getStartFieldName setStartFieldName getEndFieldEndValue setEndFieldEndValue getEndFieldStartValue setEndFieldStartValue getStartFieldEndValue setStartFieldEndValue getStartFieldStartValue setStartFieldStartValuestructs.ProductPage pageNum totalPages pageSize pageProducts numOfHits getPageNum setPageNum getPageProducts setPageProducts getPageSize setPageSize getTotalPages setTotalPages isLastPage isFirstPage getNumOfHits setNumOfHits blankPage toStringauth.Result value ARGS getValueversioning.AcquisitionDateVersioner filePathSpec ACQUISITION_DATE ACQ_DATE_FORMAT START_DATE_TIME createDataStoreReferencestools.DeleteProduct LOG client commit remove isCommit setCommit main readProdIdsFromStdin finalizeutil.SqlParser LOG parseSqlQueryMethod parseSqlQuery parseSqlWhereClause unparseSqlQuery getInfixCriteriaString getInfixCriteriaString stripOutSqlDefinition getSqlStatementArgs createFilter toPostFix parseStatement listToString mainrpc.FileManagerClientFactorymetadata.FileAttributesMetKeysrpc.AvroFileManagerClientFactory url testConnection setUrl getUrl setTestConnection getTestConnection createFileManagerClientsolr.QueryResponse numFound start completeProducts setNumFound setStart setResults getNumFound getStart getCompleteProducts getProductsstructs.QueryCriteria getElementName setElementName toStringingest.AbstractCacheServerFactory rangeQueryElementName rangeStartDateTime rangeEndDateTime uniqueElementName productTypeNames fmUrl createCachetools.ProductDumper LOG fmClient FILENAME PRODUCT_NAME getProduct writeProductFileToDir getMetadata dumpProduct dumpProduct mainingest.Cacheingest.LocalCache uniqueElements uniqueElementName uniqueElementProductTypeNames rangeQueryElementName startOfQuery endOfQuery fm LOG sync sync clear size contains setFileManager sync getFileManagerUrl liveHasProduct getUniqueElementName setUniqueElementName getUniqueElementProductTypeNames setUniqueElementProductTypeNames getProductsOverDateRange getProducts getValueForMetadata finalizeaction.GetCurrentTransferCliAction executetools.CatalogSearch LOG client freeTextBlock productFilter PostQuery PostQuery setFilter removeFilter ListProducts listElements listElements printHelp ParseQuery GenerateCASQuery CommandParser mainingest.RmiCacheServerFactory createCache createRemoteCache mainmetadata.ProductMetKeyssystem.XmlRpcFileManagerServer port webServer fileManager isAlive setCatalog startUp loadConfiguration refreshConfigAndPolicy transferringProduct getCurrentFileTransfer getCurrentFileTransfers getRefPctTransferred removeProductTransferStatus isTransferComplete pagedQuery 
getFirstPage getLastPage getNextPage getProductPctTransferred getPrevPage addProductType setProductTransferStatus getNumProducts getTopNProducts getTopNProducts hasProduct getMetadata getReducedMetadata getProductTypes getProductReferences getProductById getProductByName getProductsByProductType getElementsByProductType getElementById getElementByName complexQuery query getProductTypeByName getProductTypeById updateMetadata catalogProduct addMetadata addProductReferences ingestProduct retrieveFile transferFile moveProduct removeFile modifyProduct removeProduct getCatalogValues getOrigValues getCatalogQuery shutdown \ No newline at end of file diff --git a/src/main/resources/cdtocode/Apache OODT File Manager-VSMLSIdata.xls b/src/main/resources/cdtocode/Apache OODT File Manager-VSMLSIdata.xls new file mode 100644 index 0000000000000000000000000000000000000000..ebf4bd8c389d691a8fd282149f67cdcf94ac436e Binary files /dev/null and b/src/main/resources/cdtocode/Apache OODT File Manager-VSMLSIdata.xls differ diff --git a/src/main/resources/cdtocode/Apache OODT File Manager.xls b/src/main/resources/cdtocode/Apache OODT File Manager.xls index be9436a6b680c54fd213be2eda53e909fa6e657f..caf3033b9383a1b326c70f48c61630d3764bec6a 100644 Binary files a/src/main/resources/cdtocode/Apache OODT File Manager.xls and b/src/main/resources/cdtocode/Apache OODT File Manager.xls differ diff --git a/src/main/resources/cdtocode/Hadoop HDFS.xls b/src/main/resources/cdtocode/Hadoop HDFS.xls index a1a7baa99be8dfdea1f8c4fac937be4edf4d5c64..83a6f0bd7ff591f1d004650c642ef0f29e41b00f 100644 Binary files a/src/main/resources/cdtocode/Hadoop HDFS.xls and b/src/main/resources/cdtocode/Hadoop HDFS.xls differ diff --git a/src/main/resources/cdtocode/Hadoop MapReduce.xls b/src/main/resources/cdtocode/Hadoop MapReduce.xls index a1a7baa99be8dfdea1f8c4fac937be4edf4d5c64..d4c54fb030e2b4b00e464244e57e255372ff8c65 100644 Binary files a/src/main/resources/cdtocode/Hadoop MapReduce.xls and b/src/main/resources/cdtocode/Hadoop MapReduce.xls differ diff --git a/src/main/resources/cdtocode/cd/cd-Hadoop HDFS.txt b/src/main/resources/cdtocode/cd/cd-Hadoop HDFS.txt index 2395f46f95c58a70be6e56d9b142acee12af382d..e178252e3c8a8974b3a4d2485fa4440248e2cd61 100644 --- a/src/main/resources/cdtocode/cd/cd-Hadoop HDFS.txt +++ b/src/main/resources/cdtocode/cd/cd-Hadoop HDFS.txt @@ -1,60 +1,59 @@ -(611,446)AbstractHandler -@@@AbstractHandler -%AbstractLifeCycle -%继承¥AbstractHandler -%Handler -%继承¥@#(165,446)AbstractConnector -@@@AbstractConnector -%AbstractLifeCycle -%继承¥AbstractConnector -%Connector -%继承¥@#(366,303)AbstractLifeCycle -@@+doStart() -+doStop() -@AbstractLifeCycle -%LifeCycle -%继承¥@AbstractConnector -%AbstractLifeCycle -%继承¥AbstractHandler -%AbstractLifeCycle -%继承¥#(167,204)Connector -@+host: String -+port: int -@@Connector -%Buffers -%实现¥Connector -%LifeCycle -%实现¥@AbstractConnector -%Connector -%继承¥#(616,205)Handler -@@+handle(target,request,...) 
-@Handler -%LifeCycle -%实现¥@AbstractHandler -%Handler -%继承¥#(470,204)ThreadPool -@@+dispatch(Runnable) -@ThreadPool -%LifeCycle -%实现¥@#(110,19)Buffers -@@+getBuffer(size): Buffer -+returnBuffer(Buffer) -@Buffers -%Buffer -%依赖¥@Connector -%Buffers -%实现#(16,19)Buffer -@@@@Buffers -%Buffer -%依赖#(394,17)LifeCycle -@@+start() -+stop() -@@AbstractLifeCycle -%LifeCycle -%继承¥Connector -%LifeCycle -%实现¥ThreadPool -%LifeCycle -%实现¥Handler -%LifeCycle -%实现¥# \ No newline at end of file +(611,446)Balancer +@@@Balancer +%Protocol +%依赖¥Balancer +%NameNode +%依赖¥Balancer +%Security +%依赖¥Balancer +%Common +%依赖¥Balancer +%Client +%依赖¥@#(165,446)DataNode +@@@DataNode +%Protocol +%依赖¥DataNode +%Common +%依赖¥DataNode +%Security +%依赖¥DataNode +%Client +%依赖¥@#(366,303)NameNode +@@@NameNode +%Common +%依赖¥NameNode +%Protocol +%依赖¥NameNode +%Client +%依赖¥#(167,204)Protocol +@@@Protocol +%Common +%依赖¥Protocol +%NameNode +%依赖¥Protocol +%Security +%依赖¥rotocol +%Client +%依赖¥#(616,205)Common +@@@Common +%Protocol +%依赖¥Common +%NameNode +%依赖¥@#(470,204)Tools +@@@Tools +%Common +%依赖¥Tools +%Security +%依赖¥Tools +%NameNode +%依赖¥@#(110,19)Security +@@@Security +%DataNode +%依赖¥@#(394,17)Client +@@@Client +%Protocol +%依赖¥Client +%NameNode +%依赖¥Client +%Security +%依赖¥@# \ No newline at end of file diff --git a/src/main/resources/cdtocode/cd/cd-Hadoop MapReduce.txt b/src/main/resources/cdtocode/cd/cd-Hadoop MapReduce.txt index 2395f46f95c58a70be6e56d9b142acee12af382d..569333d8188e6f23ca7465b72a64604738acac30 100644 --- a/src/main/resources/cdtocode/cd/cd-Hadoop MapReduce.txt +++ b/src/main/resources/cdtocode/cd/cd-Hadoop MapReduce.txt @@ -1,60 +1,69 @@ -(611,446)AbstractHandler -@@@AbstractHandler -%AbstractLifeCycle -%继承¥AbstractHandler -%Handler -%继承¥@#(165,446)AbstractConnector -@@@AbstractConnector -%AbstractLifeCycle -%继承¥AbstractConnector -%Connector -%继承¥@#(366,303)AbstractLifeCycle -@@+doStart() -+doStop() -@AbstractLifeCycle -%LifeCycle -%继承¥@AbstractConnector -%AbstractLifeCycle -%继承¥AbstractHandler -%AbstractLifeCycle -%继承¥#(167,204)Connector -@+host: String -+port: int -@@Connector -%Buffers -%实现¥Connector -%LifeCycle -%实现¥@AbstractConnector -%Connector -%继承¥#(616,205)Handler -@@+handle(target,request,...) 
-@Handler -%LifeCycle -%实现¥@AbstractHandler -%Handler -%继承¥#(470,204)ThreadPool -@@+dispatch(Runnable) -@ThreadPool -%LifeCycle -%实现¥@#(110,19)Buffers -@@+getBuffer(size): Buffer -+returnBuffer(Buffer) -@Buffers -%Buffer -%依赖¥@Connector -%Buffers -%实现#(16,19)Buffer -@@@@Buffers -%Buffer -%依赖#(394,17)LifeCycle -@@+start() -+stop() -@@AbstractLifeCycle -%LifeCycle -%继承¥Connector -%LifeCycle -%实现¥ThreadPool -%LifeCycle -%实现¥Handler -%LifeCycle -%实现¥# \ No newline at end of file +(611,446)Input +@@@Input +%InputFormat +%依赖¥Input +%InputSplit +%依赖¥@InputFormat +%Input +%依赖¥InputSplit +%Input +%依赖¥#(165,446)Map +@@@Map +%Mapper +%依赖¥@Mapper +%Map +%依赖¥#(366,303)Partition +@@@Partition +%Partitioner +%依赖¥@Partitioner +%Partition +%依赖¥#(167,204)Reduce +@@@Reduce +%Reducer +%依赖¥@Reducer +%Reduce +%依赖¥#(616,205)Output +@@@Output +%OutputFormat +%依赖¥Output +%OutputCommitter +%依赖¥@OutputFormat +%Output +%依赖¥OutputCommitter +%Output +%依赖¥#(611,446)InputFormat +@@@InputFormat +%Input +%依赖¥@Input +%InputFormat +%依赖¥#(165,446)InputSplit +@@@InputSplit +%Input +%依赖¥@Input +%InputSplit +%依赖¥#(366,303)Mapper +@@@Mapper +%Map +%依赖¥@Map +%Mapper +%依赖¥#(167,204)Partitioner +@@@Partitioner +%Partition +%依赖¥@Partition +%Partitioner +%依赖¥#(616,205)Reducer +@@@Reducer +%Reduce +%依赖¥@Reduce +%Reducer +%依赖¥#(616,205)OutputFormat +@@@OutputFormat +%Output +%依赖¥@Output +%OutputFormat +%依赖#(616,205)OutputCommitter +@@@OutputCommitter +%Output +%依赖¥@Output +%OutputCommitter +%依赖¥# \ No newline at end of file diff --git a/src/main/resources/cdtocode/code/code path b/src/main/resources/cdtocode/code/code path-fm similarity index 100% rename from src/main/resources/cdtocode/code/code path rename to src/main/resources/cdtocode/code/code path-fm diff --git a/src/main/resources/cdtocode/code/code path-hdfs b/src/main/resources/cdtocode/code/code path-hdfs new file mode 100644 index 0000000000000000000000000000000000000000..040adc0172907a620c80b2a41f3ccf5b7da4cd24 --- /dev/null +++ b/src/main/resources/cdtocode/code/code path-hdfs @@ -0,0 +1 @@ +D:\eclipse-committers\Hadoop HDFS \ No newline at end of file diff --git a/src/main/resources/cdtocode/code/code path-mr b/src/main/resources/cdtocode/code/code path-mr new file mode 100644 index 0000000000000000000000000000000000000000..3663326dea100548cc3fcab66d57a3f53c606d25 --- /dev/null +++ b/src/main/resources/cdtocode/code/code path-mr @@ -0,0 +1 @@ +D:\eclipse-committers\Hadoop MapReduce \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-simEnts.txt index 05c9aae02d64594a7f4889a91097c3dedad909d9..0e2538191b3b77492c1474355130619c96ca760f 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-simEnts.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-simEnts.txt @@ -458,6 +458,15 @@ XMLValidationLayerFactory,XML files XMLValidationLayerFactory,XML syntax versioning,versioning Versioner,Versioner +Versioner,class AcquisitionDateVersioner +Versioner,class 
BasicVersioner +Versioner,class ConfigurableMetadataBasedFileVersioner +Versioner,class DateTimeVersioner +Versioner,class DirectoryProductVersioner +Versioner,class InPlaceVersioner +Versioner,class MetadataBasedFileVersioner +Versioner,class ProductTypeMetVersioner +Versioner,class SingleFileBasicVersioner AcquisitionDateVersioner,AcquisitionDateVersioner BasicVersioner,BasicVersioner BasicVersioner,Versioner extension point @@ -655,6 +664,12 @@ MetadataBasedProductMover,Metadata management MetadataBasedProductMover,custom PGE Metadata MetadataBasedProductMover,Metadata generation MetadataDumper,MetadataDumper +Metadata,class FileAttributesMetKeys +Metadata,class ProductMetKeys +Metadata,class FilemgrMetExtractor +Metadata,class AbstractFilemgrMetExtractor +Metadata,class CoreMetExtractor +Metadata,class CoreMetKeys OptimizeLuceneCatalog,OptimizeLuceneCatalog OptimizeLuceneCatalog,XMLMetadataConceptCatalog OptimizeLuceneCatalog,Catalog extension point interface diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-ziyan.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-ziyan.txt index 86d84701d53f6f7aafc94d9d5d26256becbe1abe..fa4a2dec90f51473e5122c2fa80c2ebeb85a5499 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-ziyan.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-ziyan.txt @@ -458,6 +458,15 @@ XML Validation Layer Factory , XML files XML Validation Layer Factory , XML syntax versioning , versioning Versioner , Versioner +Versioner , class Acquisition Date Versioner +Versioner , class Basic Versioner +Versioner , class Configurable Metadata Based File Versioner +Versioner , class Date Time Versioner +Versioner , class Directory Product Versioner +Versioner , class In Place Versioner +Versioner , class Metadata Based File Versioner +Versioner , class Product Type Met Versioner +Versioner , class Single File Basic Versioner Acquisition Date Versioner , Acquisition Date Versioner Basic Versioner , Basic Versioner Basic Versioner , Versioner extension point @@ -655,6 +664,12 @@ Metadata Based Product Mover , Metadata management Metadata Based Product Mover , custom PGE Metadata Metadata Based Product Mover , Metadata generation Metadata Dumper , Metadata Dumper +Metadata , class File Attributes Met Keys +Metadata , class Product Met Keys +Metadata , class Filemgr Met Extractor +Metadata , class Abstract Filemgr Met Extractor +Metadata , class Core Met Extractor +Metadata , class Core Met Keys Optimize Lucene Catalog , Optimize Lucene Catalog Optimize Lucene Catalog , XML Metadata Concept Catalog Optimize Lucene Catalog , Catalog extension point interface diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW-relation.txt new file mode 100644 index 
0000000000000000000000000000000000000000..92b130e21fe3e51603fef0e4385eb504ae8592af --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW-relation.txt @@ -0,0 +1,111 @@ +we&version 0.21&依赖 +we&version 0.21&依赖 +we&hdf&依赖 +modular structure&hdf and version 0.21&AGGREGATION +we&modular structure&依赖 +we&modular structure&依赖 +we&hdf&依赖 +static analysis&source code&AGGREGATION +class and group&class ( module )&AGGREGATION +structure&figure&依赖 +[ module view&] 1.1 Modularity risk&依赖 +[ module view&] 1.1 Modularity risk&依赖 +[ module view&] 1.1 Modularity risk&依赖 +[ module view&] 1.1 Modularity risk&依赖 +[ module view&time&依赖 +it&structure& +[ module view&] 1.1 Modularity risk&依赖 +[ module view&software&依赖 +[ module view&time&依赖 +[ module view&time&依赖 +[ module view&software&依赖 +[ module view&time&依赖 +development&software&AGGREGATION +[ module view&] 1.1 Modularity risk&依赖 +[ module view&software&依赖 +[ module view&software&依赖 +[ module view&] 1.1 Modularity risk&依赖 +[ module view&] 1.1 Modularity risk&依赖 +section&characteristic&依赖 +section&four signal&依赖 +section&code&依赖 +characteristic&code&AGGREGATION +part&package&AGGREGATION +class or group&class&AGGREGATION +its&own& +class or group&more dependency ( incoming or outgoing )&依赖 +they&code&依赖 +module structure&structure&GENERALIZATION +signal&module structure&依赖 +package hdf&two&依赖 +hdfs package&code&依赖 +hdfs package&package&GENERALIZATION +hdfs.common package&package&GENERALIZATION +default port number&server.namenode and server.datanode package&依赖 +NameNode and DataNode&default port number&依赖 +server&package&依赖 +server&package& +hdfs.common instead&namenode or datanode server&依赖 +server&dependency&依赖 +1.1.1.2 hfds.security security.token.delegation.DelegationTokenSecretManager&server.namenode.FSNameSystem&依赖 +security code&code&GENERALIZATION +security code&namenode&依赖 +security code&other server&依赖 +1.1.1.3 hdfs.protocol The class blocklistaslong&server.datanode module&依赖 +1.1.1.3 hdfs.protocol The class blocklistaslong&ReplicaInfo&依赖 +hdfs.protocol&server&依赖 +1.1.1.4 hdfs.server.protocol&server.common&依赖 +protocol&defined constant&依赖 +1.1.1.4 hdfs.server.protocol&two class&依赖 +1.1.1.4 hdfs.server.protocol&two class&依赖 +1.1.1.4 hdfs.server.protocol&server.common&依赖 +they&communication&依赖 +they&communication&依赖 +they&server&依赖 +they&server&依赖 +their&use& +hdfs.server.protocol&protocol message&依赖 +its&messages& +hdfs.server.protocol&code&依赖 +hdfs.server.protocol&class&依赖 +It&protocol&依赖 +It&class&依赖 +It&dependency&依赖 +1.1.1.5 server.common IncorrectVersionException and InconsistentFSStateException&server.protocol&依赖 +function ( jsphelper.sortnodelist )&relevant&依赖 +namenode package&package&GENERALIZATION +function ( jsphelper.sortnodelist )&it&依赖 +JspHelper&namenode&依赖 +it&other server&依赖 +function ( jsphelper.sortnodelist )&namenode package&依赖 +1.1.1.6 hdfs.server.namenode server.namenode&servlet&依赖 +class namenode.FSNameSystem&multiple cyclic dependency&依赖 +It&namenode.NameNode , namenode.FSNameSystemMetrics and namenode.LeaseManager&依赖 +It&direct cyclic dependency&依赖 +1.1.1.7 hdfs.server.datanode server.datanode&hdfs.DFSClient&依赖 +1.1.1.8 hdfs.server.balancer server.balancer&hdfs.DFSClient&依赖 +possibility&dependency&依赖 +possibility&dependency&依赖 +balancer&namenode&依赖 +namenode.UnsupportedActionException&namenode and balancer namenode.Namenode&依赖 +it&port number&依赖 +namenode.UnsupportedActionException&it&依赖 +namenode&number& +block placement 
policy&balancer&AGGREGATION +policy&namenode&AGGREGATION +block placement policy&policy&依赖 +block placement policy&namenode&依赖 +check&protocol message&依赖 +check&server.protcol&依赖 +class server.balancer.Balancer&several cyclic dependency&依赖 +they&same source file&依赖 +dependency structure&class&AGGREGATION +effect&dependency&AGGREGATION +different component&them&依赖 +1.1.1.9 hdfs.tools tool&different component&依赖 +couple&different component&AGGREGATION +1.1.1.9 hdfs.tools tool&couple&依赖 +different component&low coupling&依赖 +main domain&a filesystem ( debugging&AGGREGATION +it&sense&依赖 +user&convenience& diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c96078f1a870c4b04938b20d7c9470617c594a2 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM-relation.txt @@ -0,0 +1,359 @@ +platform or framework&big data problem&依赖 +You&suite&依赖 +number&service&AGGREGATION +suite&( ingesting&依赖 +suite&number&依赖 +suite&service&依赖 +You&it&依赖 +Hadoop component&Hadoop ecosystem&依赖 +backbone&Hadoop Ecosystem&AGGREGATION +one&large data set&依赖 +different type&large data set&AGGREGATION +one&i.e. structured , unstructured and semus structured datum )&依赖 +one&different type&依赖 +we&whole hdf&依赖 +hdf&level&依赖 +hdf&resource&依赖 +level&abstraction&AGGREGATION +we&single unit&依赖 +hdf&abstraction&依赖 +we&whole hdf&依赖 +we&single unit&依赖 +It&us&依赖 +our&data& +hdf&two core component&依赖 +hdf&i.e. namenode&依赖 +it&actual datum&依赖 +table&content&AGGREGATION +It&metada&依赖 +you&table&依赖 +you&content&依赖 +it&less storage&依赖 +datum&DataNodes&依赖 +datum&other hand&依赖 +it&more storage resource&依赖 +your&data& +datanode&distributed environment&依赖 +datanode&laptops and desktop&依赖 +your&laptops& +datanode&distributed environment&依赖 +You&namenode while&依赖 +it&request&依赖 +it&client&依赖 +YARN&brain&依赖 +YARN&YARN&依赖 +your&Ecosystem& +YARN&Hadoop Ecosystem&依赖 +brain&Hadoop Ecosystem&AGGREGATION +your&activities& +It&processing activity&依赖 +It&two major component&依赖 +It&i.e. 
resource manager&依赖 +processing department&department&GENERALIZATION +Resource Manager&processing department&依赖 +It&processing request&依赖 +actual processing&place&依赖 +part&request&AGGREGATION +Node manager&Data Node&依赖 +It&execution&依赖 +execution&task&AGGREGATION +It&task&依赖 +It&single Data Node&依赖 +scheduler&scheduling algorithm&依赖 +your&requirements& +while application manager&job submission&依赖 +core component&processing&AGGREGATION +ResourceManager&two component&依赖 +logic&processing&AGGREGATION +MapReduce&other word&依赖 +application&Hadoop environment&依赖 +application&large data set&依赖 +application&distributed and parallel algorithm&依赖 +map ( )&MapReduce program&依赖 +Map function&filter , group and sort&依赖 +Map function&action&依赖 +reduce function aggregate&reduce function aggregate&依赖 +us&above example&依赖 +better understanding&MapReduce program&AGGREGATION +We&student&依赖 +sample case&student&AGGREGATION +their&departments& +We&sample case&依赖 +We&department&依赖 +number&student&AGGREGATION +We&number&依赖 +Map program&student&依赖 +Map program&appearing&依赖 +key value pair&Reduce function&依赖 +total number&student&AGGREGATION +Reduce function&department&依赖 +APACHE PIG PIG&two part&依赖 +APACHE PIG PIG&Pig Latin&依赖 +You&it&依赖 +You&Java and JVM&依赖 +It&pig latin language&依赖 +pig latin language&SQL&依赖 +pig latin language&command structure&依赖 +10 line&pig latin =&AGGREGATION +back end&Pig job&AGGREGATION +200 line&Map-Reduce Java code&AGGREGATION +compiler&MapReduce&依赖 +compiler&pig latin&依赖 +It&sequential set&依赖 +It&MapReduce job&依赖 +sequential set&MapReduce job&AGGREGATION +PIG&Yahoo&依赖 +It&platform&依赖 +It&etl ( extract&依赖 +It&data flow&依赖 +load command&datum&依赖 +load command&PIG&依赖 +we&it&依赖 +we&various function&依赖 +you&screen&依赖 +you&hdf&依赖 +you&result&依赖 +you&datum&依赖 +you&datum&依赖 +APACHE HIVE Facebook&people&依赖 +APACHE HIVE Facebook&HIVE&依赖 +&large data set&依赖 +&SQL-like interface&依赖 +&distributed environment&依赖 +query language&Hive&AGGREGATION +It&Hive Command Line and JDBC/ODBC driver&依赖 +It&2 basic component&依赖 +it&purpose&依赖 +i.e. large datum set processing&i.e. large datum set processing&依赖 +i.e. large datum set processing&i.e. large datum set processing&依赖 +i.e. large datum set processing&purpose&依赖 +i.e. large datum set processing&i.e. large datum set processing&依赖 +i.e. large datum set processing&purpose&依赖 +i.e. 
large datum set processing&purpose&依赖 +primitive data type&SQL&AGGREGATION +It&SQL&依赖 +It&primitive data type&依赖 +your&needs& +You&predefined function&依赖 +Machine learning algorithm&self-learning machine&依赖 +Machine learning algorithm&us&依赖 +it&important future decision&依赖 +descendant artificial intelligence ( ai )&artificial intelligence ( ai )&AGGREGATION +what mahout&what mahout&依赖 +It&collaborative filter , clustering and classification&依赖 +Mahout&function& +us&1&依赖 +us&them&依赖 +their&patterns& +their&characteristics& +It&similar group&依赖 +similar group&datum&AGGREGATION +article&research papers etc&依赖 +It&datum&依赖 +It&datum&依赖 +article&blog&依赖 +object&which&依赖 +Frequent item&mahout check&依赖 +cell phone and cover&example&依赖 +you&cell phone&依赖 +It&library&依赖 +It&predefined set&依赖 +predefined set&different use case&依赖 +predefined set&different inbuilt algorithm&依赖 +predefined set&library&AGGREGATION +APACHE SPARK Apache Spark&distributed computing environment&依赖 +APACHE SPARK Apache Spark&real time data analytic&依赖 +University and Berkeley&California&AGGREGATION +Spark&Scala&依赖 +speed&data processing&AGGREGATION +It&in-memory computation&依赖 +it&Map-Reduce&依赖 +it&high processing power&依赖 +standard library&seamless integration&依赖 +standard library&complex workflow&依赖 +various set&service&AGGREGATION +its&capabilities& +it&capability&依赖 +it&various set&依赖 +it&service&依赖 +it&integrate&依赖 +apache spark best fit&real time processing&依赖 +apache spark best fit&real time processing&依赖 +apache spark best fit&real time processing&依赖 +Spark&ability& +it&best result&依赖 +Hadoop&operation& +their&Data& +it&other word&依赖 +It&datum&依赖 +type&datum&AGGREGATION +It&type&依赖 +It&’s bigtable&依赖 +Google&BigTable& +top&hdf&AGGREGATION +It&sparse datum&依赖 +It&fault tolerant way&依赖 +HBase application&REST , Avro and Thrift api&依赖 +HBase&Java&依赖 +us&example&依赖 +you&customer&依赖 +you&number&依赖 +who&email&依赖 +who&word complaint&依赖 +number&customer&AGGREGATION +You&customer email&依赖 +billion&customer email&AGGREGATION +You&billion&依赖 +their&emails& +we&set&依赖 +small amount&datum&AGGREGATION +we&large datum&依赖 +kind&problem&AGGREGATION +kind&datum&AGGREGATION +It&open source application&依赖 +It&Google Dremel&依赖 +replica&Google Dremel&AGGREGATION +powerful feature&Drill&AGGREGATION +It&different kinds NoSQL databases and file system&依赖 +petabytes and exabyte&data efficiently&AGGREGATION +you&minute )&依赖 +we&say&依赖 +we&petabytes and exabyte&依赖 +we&data efficiently&依赖 +variety&data store&AGGREGATION +main power&Apache Drill&AGGREGATION +ANSI SQL&SQL&GENERALIZATION +Apache Drill&ANSI SQL&依赖 +million&user&AGGREGATION +It&powerful scalability factor&依赖 +their&requests& +combination&various service&AGGREGATION +APACHE ZOOKEEPER Apache Zookeeper&Hadoop job&依赖 +Hadoop job&combination&依赖 +Hadoop job&various service&依赖 +s combation of various service&Hadoop Ecosystem&AGGREGATION +Apache Zookeeper&Zookeeper&GENERALIZATION +coordinator&Hadoop job&AGGREGATION +Apache Zookeeper&various service&依赖 +Apache Zookeeper&distributed environment&依赖 +it&Zookeeper&依赖 +service&common configuration while&依赖 +service&many problem&依赖 +service&interaction&依赖 +configuration&service&AGGREGATION +It&lot&依赖 +It&time&依赖 +lot&time&AGGREGATION +it&simple service&依赖 +APACHE OOZIE&Apache Oozie&依赖 +APACHE OOZIE&Hadoop Ecosystem&依赖 +APACHE OOZIE&clock and alarm service&依赖 +Oozie&Apache job&依赖 +It&Hadoop job&依赖 +two kind&Oozie job&AGGREGATION +sequential set&action&AGGREGATION +You&it&依赖 +You&relay race&依赖 +his&part& +athlete&last one&依赖 +athlete&part&依赖 
+our&body& +Oozie coordinator&same manner&依赖 +Oozie coordinator&availability&依赖 +Oozie coordinator&datum&依赖 +we&external stimulus&依赖 +availability&datum&AGGREGATION +important part&Hadoop Ecosystem&AGGREGATION +our&Ecosystem& +APACHE FLUME Ingesting datum&Hadoop Ecosystem&依赖 +collect , aggregate and move large amount&data set&AGGREGATION +It&solution&依赖 +It&online streaming datum&依赖 +It&us&依赖 +Flume agent&streaming datum&依赖 +Flume agent&various data source&依赖 +architecture&Flume&AGGREGATION +us&architecture&依赖 +Flume agent&hdf&依赖 +us&Flume&依赖 +data source&source&GENERALIZATION +you&data source&依赖 +one&famous source&AGGREGATION +Twitter&streaming datum&依赖 +Twitter&famous source&依赖 +flume agent&source , sink and channel&依赖 +flume agent&3 component&依赖 +it&incoming streamline and store&依赖 +it&datum&依赖 +it&channel&依赖 +Channel&source&依赖 +source&datum&AGGREGATION +Channel&datum&依赖 +our last component i.e. sink&our last component i.e. sink&依赖 +our&component& +apache sqoop&flume and sqoop&依赖 +apache sqoop&major difference&依赖 +Flume&unstructured datum&依赖 +Flume&hdf&依赖 +we&Sqoop command&依赖 +Sqoop&diagram&依赖 +our&task& +sub task&datum&依赖 +sub task&part&依赖 +part&datum&AGGREGATION +sub task&Hadoop Ecosystem&依赖 +Map task&whole datum&依赖 +Export&similar manner&依赖 +chunk&datum&AGGREGATION +our&Job& +Map task&datum&依赖 +it&Map task&依赖 +Map task&chunk&依赖 +Map task&hdf&依赖 +we&Job&依赖 +chunk&structured data destination&依赖 +exported chunk&datum&AGGREGATION +most&case&AGGREGATION +we&whole datum&依赖 +we&destination&依赖 +Apache Lucene&Java&依赖 +It&search and full indexing&依赖 +It&Lucene Java search library&依赖 +It&core&依赖 +It&software&依赖 +It&Apache Hadoop cluster&依赖 +number&host&AGGREGATION +It&Hadoop service&依赖 +It&step process&依赖 +It&step&依赖 +It&number&依赖 +It&configuration&依赖 +It&Hadoop service&依赖 +configuration&Hadoop service&AGGREGATION +It&Hadoop service&依赖 +It&configuration&依赖 +service&user&依赖 +your&attention& +I&attention&依赖 +Hadoop Ecosystem&many big company&依赖 +Hadoop Ecosystem&Facebook , Google , Yahoo , University&依赖 +Hadoop Ecosystem&success&依赖 +Hadoop&capabilities& +its&success& +Hadoop Ecosystem&many big company&依赖 +Hadoop Ecosystem&success&依赖 +Facebook , Google , Yahoo , University berkeley ) etc.&california (&AGGREGATION +berkeley ) etc.&part&依赖 +their&part& +Hadoop Ecosystem&Facebook , Google , Yahoo , University&依赖 +knowledge&Hadoop Ecosystem&依赖 +knowledge&Hadoop Ecosystem&依赖 +knowledge&Hadoop Ecosystem&依赖 +You&set&依赖 +You&Hadoop component&依赖 +Hadoop component&solution&依赖 +set&Hadoop component&AGGREGATION +set&service&AGGREGATION +we&service&依赖 +we&Hadoop Ecosystem&依赖 +we&set&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture Guide-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture Guide-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..a531f53f956508fe7515e892a863d0283b38e1db --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture Guide-relation.txt @@ -0,0 +1,548 @@ +The First Baby&File System Metadata The Communication Protocols Robustness Data Disk Failure&依赖 +Replica Selection Safemode The Persistence&File System Metadata The Communication Protocols Robustness Data Disk Failure&AGGREGATION +The First Baby&Replica Selection Safemode The Persistence&依赖 +It&many similarity&依赖 +It&distributed file system&依赖 +application&large data set&依赖 +hdf&few POSIX requirement&依赖 +hdf&infrastructure&依赖 +hdf&Apache Nutch web search engine project&依赖 +assumption and goal hardware failure hardware 
failure&exception&依赖 +HDFS instance&server machine&依赖 +hundreds or thousand&server machine&AGGREGATION +each store part&each store part&依赖 +system&data& +each store part&’s datum&依赖 +each store part&’s datum&AGGREGATION +HDFS instance&hundreds or thousand&依赖 +huge number&component&AGGREGATION +non-trivial probability&failure&AGGREGATION +component&failure&依赖 +component&non-trivial probability&依赖 +component&hdf&AGGREGATION +detection&hdf&依赖 +core architectural goal&hdf&AGGREGATION +detection&hdf&依赖 +detection&fault&AGGREGATION +their&data& +hdf&batch processing&依赖 +emphasis&low latency&依赖 +high throughput&data access&AGGREGATION +emphasis&data access&依赖 +low latency&data access&AGGREGATION +emphasis&data access&依赖 +POSIX&many hard requirement&依赖 +Large Data Sets application&large data set&依赖 +typical file&size&依赖 +typical file&size&依赖 +typical file&size&依赖 +typical file&terabyte&依赖 +typical file&terabyte&依赖 +typical file&size&依赖 +hundred&node&AGGREGATION +It&million&依赖 +It&ten&依赖 +It&file&依赖 +It&million&依赖 +It&file&依赖 +million&file&AGGREGATION +It&ten&依赖 +ten&million&AGGREGATION +Simple Coherency Model HDFS application&write-once-read-many access model&依赖 +Simple Coherency Model HDFS application&file&依赖 +assumption&data coherency issue&实现 +MapReduce application&application&GENERALIZATION +MapReduce application&model&依赖 +it&datum&依赖 +size&data set&AGGREGATION +network congestion&overall throughput&依赖 +network congestion&system&依赖 +overall throughput&system&AGGREGATION +platform&choice&AGGREGATION +widespread adoption&hdf&AGGREGATION +large set&application&AGGREGATION +master/slave architecture&architecture&GENERALIZATION +namenode and datanodes hdfs&master/slave architecture&依赖 +HDFS cluster&master server&依赖 +HDFS cluster&cluster&GENERALIZATION +HDFS cluster&single NameNode&依赖 +master server&file system namespace&依赖 +number&addition&依赖 +number&addition&依赖 +number&addition&依赖 +number&addition&依赖 +number&DataNodes&AGGREGATION +number&addition&依赖 +cluster&storage&依赖 +hdf&a file system namespace&依赖 +set&DataNodes&AGGREGATION +file&one or more block&依赖 +block&set&依赖 +block&DataNodes&依赖 +NameNode&file system namespace operation&依赖 +It&DataNodes&依赖 +mapping&block&AGGREGATION +It&mapping&依赖 +It&block&依赖 +system&clients& +DataNodes&block creation&依赖 +DataNodes&instruction&依赖 +DataNodes&NameNode&依赖 +hdf architecture namenode and datanode&software&依赖 +piece&software&AGGREGATION +machine&GNU/Linux operating system&依赖 +machine&Java&依赖 +DataNode software&software&GENERALIZATION +machine&NameNode&依赖 +Usage&portable Java language&AGGREGATION +wide range&machine&AGGREGATION +dedicated machine&machine&GENERALIZATION +NameNode software&software&GENERALIZATION +dedicated machine&NameNode software&依赖 +typical deployment&dedicated machine&依赖 +one instance&DataNode software&AGGREGATION +existence&single NameNode&AGGREGATION +existence&architecture&实现 +existence&system&实现 +architecture&system&AGGREGATION +NameNode&HDFS metada&依赖 +system&flows&依赖 +system&such a way&依赖 +user datum&NameNode&依赖 +File System Namespace hdf&traditional hierarchical file organization&依赖 +user&directory&依赖 +user&directory and store file&依赖 +one&file&依赖 +file system namespace hierarchy&most other existing file system&依赖 +hdf&user quota&实现 +hdf&hard link&依赖 +HDFS architecture&feature&实现 +HDFS architecture&architecture&GENERALIZATION +NameNode&file system namespace&依赖 +change&NameNode&依赖 +its&properties& +number&replica&AGGREGATION +application&number&依赖 +application&file&依赖 +replica&file&AGGREGATION +application&replica&依赖 
+number&file&AGGREGATION +copy&file&AGGREGATION +number©&AGGREGATION +replication factor&file&AGGREGATION +information&NameNode&依赖 +It&file&依赖 +It&sequence&依赖 +sequence&block&AGGREGATION +It&block&依赖 +block&fault tolerance&依赖 +block&file&AGGREGATION +block size and replication factor&file&依赖 +replication&block&AGGREGATION +NameNode&replication&依赖 +NameNode&block&依赖 +NameNode&decision&依赖 +Receipt&Heartbeat&AGGREGATION +list&block&AGGREGATION +Blockreport&list&依赖 +Blockreport&DataNode&依赖 +Blockreport&block&依赖 +placement&replica&AGGREGATION +replica placement&hdf&依赖 +replica placement&most other distributed file system&依赖 +lot&tuning and experience&AGGREGATION +feature&lot&依赖 +feature&tuning and experience&依赖 +purpose&rack-aware replica placement policy&AGGREGATION +purpose&data reliability&依赖 +current implementation&direction&依赖 +current implementation&direction&依赖 +short-term goal&it&依赖 +its&behavior& +Large HDFS instance&cluster&依赖 +Large HDFS instance&computer&依赖 +cluster&computer&AGGREGATION +NameNode&rack id&依赖 +simple but non-optimal policy&replica&依赖 +entire rack&bandwidth&依赖 +entire rack&use&依赖 +entire rack&multiple rack&依赖 +use&bandwidth&AGGREGATION +policy&replica&依赖 +policy&cluster&依赖 +write&block&依赖 +policy&cost&依赖 +HDFS&policy& +’s placement policy&one replica&依赖 +’s placement policy&one node&依赖 +inter-rack write traffic&write performance&依赖 +policy&inter-rack write traffic&依赖 +chance&rack failure&AGGREGATION +policy&impact datum reliability and availability guarantee&依赖 +it&aggregate network bandwidth&依赖 +datum&three&依赖 +datum&two unique rack&依赖 +replica&rack&依赖 +other third&rack&依赖 +two third&replica&AGGREGATION +One third&replica&AGGREGATION +policy&performance&依赖 +current , default replica placement policy&progress&依赖 +current , default replica placement policy&progress&依赖 +hdf&replica&依赖 +Replica Selection&global bandwidth consumption&依赖 +hdf&read request&依赖 +replica&remote replica&依赖 +NameNode&special state&依赖 +NameNode&special state&依赖 +Replication&data block&AGGREGATION +NameNode&Heartbeat and Blockreport message&依赖 +NameNode&DataNodes&依赖 +Blockreport&data block&依赖 +list&data block&AGGREGATION +Blockreport&hosting&依赖 +block&replica&依赖 +block&specified minimum number&依赖 +specified minimum number&replica&AGGREGATION +data block&block&GENERALIZATION +replica&data block&AGGREGATION +minimum number&replica&AGGREGATION +namenode exit&namenode (&依赖 +namenode exit&Safemode state&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&Safemode state&依赖 +namenode exit&namenode (&依赖 +namenode exit&Safemode state&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&namenode (&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&namenode (&依赖 +namenode exit&Safemode state&依赖 +namenode exit&Safemode state&依赖 +namenode exit&Safemode state&依赖 +namenode exit&Safemode state&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&Safemode state&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&namenode (&依赖 +namenode exit&Safemode state&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&Safemode state&依赖 +It&list&依赖 +It&data block&依赖 +It&)&依赖 +specified number&replica&AGGREGATION +NameNode&block&依赖 +NameNode&other datanode&依赖 +HDFS namespace&NameNode&依赖 +Persistence&File System Metadata&AGGREGATION +NameNode&transaction log&依赖 
+NameNode&EditLog&依赖 +NameNode&system metada&依赖 +NameNode&file&依赖 +NameNode&local host OS file system&依赖 +its&system& +entire file system namespace&file&依赖 +NameNode&system& +FsImage&’s local file system&依赖 +FsImage&file&依赖 +NameNode&memory&依赖 +NameNode&entire file system namespace and file blockmap&依赖 +image&entire file system namespace and file blockmap&AGGREGATION +4 GB&RAM&AGGREGATION +huge number&files and directory&AGGREGATION +in-memory representation&FsImage&AGGREGATION +it&FsImage and EditLog&依赖 +it&disk&依赖 +It&old EditLog&依赖 +transaction&persistent FsImage&依赖 +its&transactions& +checkpoint¤t implementation&依赖 +Work&periodic checkpointing&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +DataNode&knowledge&依赖 +DataNode&HDFS file&依赖 +It&HDFS datum&依赖 +It&block&依赖 +It&HDFS datum&依赖 +block&HDFS datum&AGGREGATION +It&block&依赖 +It&block&依赖 +It&HDFS datum&依赖 +DataNode&file&依赖 +DataNode&same directory&依赖 +optimal number&file&AGGREGATION +it&heuristic&依赖 +It&local file&依赖 +local file system&single directory&依赖 +local file system&huge number&依赖 +huge number&file&AGGREGATION +It&same directory&依赖 +list&HDFS data block&AGGREGATION +it&local file system&依赖 +Communication Protocols All HDFS communication protocol&TCP/IP protocol&依赖 +Communication Protocols All HDFS communication protocol&top&依赖 +top&TCP/IP protocol&AGGREGATION +client&configurable TCP port&依赖 +client&NameNode machine&依赖 +NameNode machine&machine&GENERALIZATION +client&connection&依赖 +It&NameNode&依赖 +It&ClientProtocol&依赖 +DataNodes&DataNode Protocol&依赖 +DataNodes&NameNode&依赖 +( rpc ) abstraction&Client Protocol&依赖 +NameNode&rpc&依赖 +NameNode&design&依赖 +it&RPC request&依赖 +robustness primary objective&hdf&AGGREGATION +robustness primary objective&datum&依赖 +presence&failure&AGGREGATION +three common type&failure&AGGREGATION +Data Disk Failure&NameNode&依赖 +Data Disk Failure&Heartbeat message&依赖 +network partition&subset&依赖 +network partition&DataNodes&依赖 +subset&DataNodes&AGGREGATION +NameNode&condition&依赖 +NameNode&absence&依赖 +NameNode&Heartbeat message&依赖 +absence&Heartbeat message&AGGREGATION +NameNode mark&recent heartbeat&依赖 +datum&hdf&依赖 +DataNode death&block&依赖 +DataNode death&replication factor&依赖 +replication factor&block&AGGREGATION +their&value& +HDFS architecture&data rebalancing scheme&依赖 +scheme&one DataNode&依赖 +scheme&datum&依赖 +free 
space&certain threshold&依赖 +scheme&one DataNode to ###&依赖 +free space&certain threshold&依赖 +scheme&additional replica&依赖 +scheme&additional replica&依赖 +scheme&particular file&依赖 +event&sudden high demand&AGGREGATION +scheme&sudden high demand&依赖 +type&data rebalancing scheme&AGGREGATION +block&datum&AGGREGATION +corruption&fault&依赖 +corruption&storage device&依赖 +checksum checking&HDFS file&依赖 +contents&HDFS file&AGGREGATION +checksum checking&contents&依赖 +checksum checking&HDFS file&依赖 +checksum checking&contents&依赖 +checksum checking&HDFS file&依赖 +checksum checking&contents&依赖 +HDFS file&file&GENERALIZATION +it&separate hidden file&依赖 +it&checksum&依赖 +it&block&依赖 +block&file and store&AGGREGATION +checksum&block&AGGREGATION +client&HDFS file&依赖 +it&file and store&依赖 +file contents&contents&GENERALIZATION +it&checksum&依赖 +client&file contents&依赖 +replica&block&AGGREGATION +DataNode&replica&依赖 +DataNode&block&依赖 +central data structure&hdf&AGGREGATION +Metadata Disk Failure The FsImage&hdf&依赖 +corruption&file&AGGREGATION +corruption&HDFS instance&依赖 +multiple copy&FsImage and EditLog&AGGREGATION +update&updated synchronously&依赖 +synchronous update&rate&依赖 +rate&namespace transaction&AGGREGATION +synchronous update&second&依赖 +synchronous update&namespace transaction&依赖 +synchronous update&multiple copy&AGGREGATION +it&latest consistent fsimage&依赖 +it&use&依赖 +NameNode machine&HDFS cluster&依赖 +NameNode machine&failure&依赖 +single point&failure&AGGREGATION +automatic restart and failover&NameNode software&AGGREGATION +particular instant&time&AGGREGATION +copy&datum&AGGREGATION +snapshot snapshot©&依赖 +snapshot snapshot&support&依赖 +snapshot snapshot&datum&依赖 +One usage&corrupted HDFS instance&依赖 +One usage&snapshot feature&AGGREGATION +hdf&snapshot&依赖 +application&datum&依赖 +they&one or more time&依赖 +hdf&write-once-read-many semantics&依赖 +hdf&file&依赖 +chunk&different DataNode&依赖 +HDFS client&file datum&依赖 +HDFS client&temporary local file&依赖 +HDFS client&client&GENERALIZATION +HDFS client&fact&依赖 +Application write&temporary local file&依赖 +client contact&NameNode&依赖 +local file&datum worth&依赖 +local file&one HDFS block size&依赖 +client contact&NameNode&依赖 +client contact&NameNode&依赖 +namenode insert&file name&依赖 +namenode insert&file system hierarchy&依赖 +namenode insert&file name&依赖 +namenode insert&file system hierarchy&依赖 +NameNode&DataNode&依赖 +NameNode&identity&依赖 +identity&DataNode&AGGREGATION +NameNode&client request&依赖 +client&datum&依赖 +client&block&依赖 +client&datum&依赖 +client&datum&依赖 +client&block&依赖 +client&block&依赖 +un-flushed datum&DataNode&依赖 +client&NameNode&依赖 +NameNode&persistent store&依赖 +NameNode&point&依赖 +NameNode&file creation operation&依赖 +careful consideration&target application&AGGREGATION +above approach&target application&依赖 +above approach&careful consideration&依赖 +application&streaming write&依赖 +application&file&依赖 +network speed&writes&依赖 +client&client side buffering&依赖 +client&remote file&依赖 +network speed&network impact throughput&依赖 +Earlier distributed file system&client side caching&依赖 +Earlier distributed file system&client side caching&依赖 +higher performance&data upload&AGGREGATION +POSIX requirement&data upload&依赖 +POSIX requirement&higher performance&依赖 +client&datum&依赖 +client&HDFS file&依赖 +its&data& +datum&local file&依赖 +HDFS file&replication factor&依赖 +replication factor&three&AGGREGATION +HDFS file&three&依赖 +local file&user datum&依赖 +full block&user datum&AGGREGATION +local file&full block&依赖 +client&list&依赖 +list&DataNodes&AGGREGATION +client&NameNode&依赖 
+DataNodes&replica&依赖 +DataNodes&block&依赖 +list&DataNodes&依赖 +client&first DataNode&依赖 +client&data block&依赖 +its&repository& +first DataNode&datum&依赖 +turn start&portion&依赖 +portion&data block&AGGREGATION +second DataNode&portion&依赖 +turn start&data block&依赖 +second DataNode&portion&依赖 +third DataNode&datum&依赖 +third DataNode&local repository&依赖 +DataNode&previous one&依赖 +DataNode&datum&依赖 +DataNode&pipeline&依赖 +datum&one DataNode&依赖 +datum&next&依赖 +Accessibility hdf&many different way&依赖 +Accessibility hdf&application&依赖 +file&HDFS instance&AGGREGATION +FS Shell HDFS&user datum&依赖 +form&files and directory&AGGREGATION +FS shell&user interact&依赖 +FS shell&datum&依赖 +syntax&command set&AGGREGATION +Action Command&directory&依赖 +txt FS shell&application&依赖 +cat / foodir/myfile&language&依赖 +Action Command&/ foodir bin/hadoop dfs&依赖 +contents&file&AGGREGATION +Action Command&Safemode bin/hadoop dfsadmin&依赖 +list&DataNodes bin/hadoop dfsadmin&AGGREGATION +refreshnodes browser interface a typical hdf&web server&依赖 +safemode&Generate&依赖 +list&refreshnodes browser interface a typical hdf&依赖 +Action Command&cluster&依赖 +its&files& +file&user&依赖 +it&hdf&依赖 +hdf first&/ trash directory&依赖 +hdf first&it&依赖 +hdf first&file&依赖 +it&/ trash&依赖 +file&configurable amount&依赖 +configurable amount&time&AGGREGATION +file&/ trash&依赖 +file&time&依赖 +expiry&life&AGGREGATION +NameNode&/ trash&依赖 +NameNode&file&依赖 +NameNode&HDFS namespace&依赖 +NameNode&file&依赖 +its&life& +deletion&block&依赖 +deletion&file&AGGREGATION +time&corresponding increase&AGGREGATION +user&file&依赖 +it&/ trash directory&依赖 +user&file&依赖 +he/she&that&依赖 +he/she&/ trash directory&依赖 +/ trash directory&file&依赖 +/ trash directory&latest copy&依赖 +latest copy&file&AGGREGATION +hdf&directory&依赖 +/ trash directory&one special feature&依赖 +hdf&policy&依赖 +hdf&file&依赖 +current default policy&/ trash&依赖 +current default policy&file&依赖 +policy&future&依赖 +policy&defined interface&依赖 +NameNode&excess replica&依赖 +next heartbeat transfer&information&依赖 +corresponding free space&cluster&依赖 +DataNode&corresponding block&依赖 +completion&setReplication API call&AGGREGATION +appearance&free space&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab816b834fb8b000df29d5bc2a36a1c8de6a6e8d --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture-relation.txt @@ -0,0 +1,561 @@ +It&many similarity&依赖 +It&distributed file system&依赖 +application&large data set&依赖 +hdf&few POSIX requirement&依赖 +hdf&infrastructure&依赖 +hdf&Apache Nutch web search engine project&依赖 +part&Apache Hadoop Core project&AGGREGATION +hdf&Apache Hadoop Core project&依赖 +assumption and goal hardware failure hardware failure&exception&依赖 +HDFS instance&server machine&依赖 +hundreds or thousand&server machine&AGGREGATION +each store part&each store part&依赖 +system&data& +each store part&’s datum&依赖 +each store part&’s datum&AGGREGATION +HDFS instance&hundreds or thousand&依赖 +huge number&component&AGGREGATION +non-trivial probability&failure&AGGREGATION +component&failure&依赖 +component&non-trivial probability&依赖 +component&hdf&AGGREGATION +detection&hdf&依赖 +core architectural goal&hdf&AGGREGATION +detection&hdf&依赖 +detection&fault&AGGREGATION +their&data& +hdf&batch processing&依赖 +emphasis&low latency&依赖 +high throughput&data access&AGGREGATION +emphasis&data access&依赖 +low latency&data 
access&AGGREGATION +emphasis&data access&依赖 +POSIX&many hard requirement&依赖 +Large Data Sets application&large data set&依赖 +typical file&size&依赖 +typical file&size&依赖 +typical file&size&依赖 +typical file&terabyte&依赖 +typical file&terabyte&依赖 +typical file&size&依赖 +hundred&node&AGGREGATION +It&million&依赖 +It&ten&依赖 +It&file&依赖 +It&million&依赖 +It&file&依赖 +million&file&AGGREGATION +It&ten&依赖 +ten&million&AGGREGATION +Simple Coherency Model HDFS application&write-once-read-many access model&依赖 +Simple Coherency Model HDFS application&file&依赖 +end&file&AGGREGATION +assumption&data coherency issue&实现 +MapReduce application&application&GENERALIZATION +MapReduce application&model&依赖 +it&datum&依赖 +size&data set&AGGREGATION +network congestion&overall throughput&依赖 +network congestion&system&依赖 +overall throughput&system&AGGREGATION +platform&choice&AGGREGATION +widespread adoption&hdf&AGGREGATION +large set&application&AGGREGATION +master/slave architecture&architecture&GENERALIZATION +namenode and datanodes hdfs&master/slave architecture&依赖 +HDFS cluster&master server&依赖 +HDFS cluster&cluster&GENERALIZATION +HDFS cluster&single NameNode&依赖 +master server&file system namespace&依赖 +number&addition&依赖 +number&addition&依赖 +number&addition&依赖 +number&addition&依赖 +number&DataNodes&AGGREGATION +number&addition&依赖 +cluster&storage&依赖 +hdf&a file system namespace&依赖 +set&DataNodes&AGGREGATION +file&one or more block&依赖 +block&set&依赖 +block&DataNodes&依赖 +NameNode&file system namespace operation&依赖 +It&DataNodes&依赖 +mapping&block&AGGREGATION +It&mapping&依赖 +It&block&依赖 +system&clients& +DataNodes&block creation&依赖 +DataNodes&instruction&依赖 +DataNodes&NameNode&依赖 +NameNode and DataNode&software&依赖 +piece&software&AGGREGATION +machine&GNU/Linux operating system&依赖 +machine&Java&依赖 +DataNode software&software&GENERALIZATION +machine&NameNode&依赖 +Usage&portable Java language&AGGREGATION +wide range&machine&AGGREGATION +dedicated machine&machine&GENERALIZATION +NameNode software&software&GENERALIZATION +dedicated machine&NameNode software&依赖 +typical deployment&dedicated machine&依赖 +one instance&DataNode software&AGGREGATION +existence&single NameNode&AGGREGATION +existence&architecture&实现 +existence&system&实现 +architecture&system&AGGREGATION +NameNode&HDFS metada&依赖 +system&flows&依赖 +system&such a way&依赖 +user datum&NameNode&依赖 +File System Namespace hdf&traditional hierarchical file organization&依赖 +user&directory&依赖 +user&directory and store file&依赖 +one&file&依赖 +file system namespace hierarchy&most other existing file system&依赖 +hdf&user quota&依赖 +hdf&hard link&依赖 +HDFS architecture&feature&实现 +HDFS architecture&architecture&GENERALIZATION +convention&FileSystem&AGGREGATION +feature&reserved path&依赖 +feature&reserved path&依赖 +NameNode&file system namespace&依赖 +change&NameNode&依赖 +its&properties& +number&replica&AGGREGATION +application&number&依赖 +application&file&依赖 +replica&file&AGGREGATION +application&replica&依赖 +number&file&AGGREGATION +copy&file&AGGREGATION +number©&AGGREGATION +replication factor&file&AGGREGATION +information&NameNode&依赖 +It&file&依赖 +It&sequence&依赖 +sequence&block&AGGREGATION +It&block&依赖 +block&fault tolerance&依赖 +block&file&AGGREGATION +block size and replication factor&file&依赖 +user&new block&依赖 +support&append and hsync&依赖 +replication&block&AGGREGATION +NameNode&replication&依赖 +NameNode&block&依赖 +NameNode&decision&依赖 +Receipt&Heartbeat&AGGREGATION +list&block&AGGREGATION +Blockreport&list&依赖 +Blockreport&DataNode&依赖 +Blockreport&block&依赖 +placement&replica&AGGREGATION +replica 
placement&hdf&依赖 +replica placement&most other distributed file system&依赖 +lot&tuning and experience&AGGREGATION +feature&lot&依赖 +feature&tuning and experience&依赖 +purpose&rack-aware replica placement policy&AGGREGATION +purpose&data reliability&依赖 +current implementation&direction&依赖 +current implementation&direction&依赖 +short-term goal&it&依赖 +its&behavior& +Large HDFS instance&cluster&依赖 +Large HDFS instance&computer&依赖 +cluster&computer&AGGREGATION +NameNode&rack id&依赖 +simple but non-optimal policy&replica&依赖 +entire rack&bandwidth&依赖 +entire rack&use&依赖 +entire rack&multiple rack&依赖 +use&bandwidth&AGGREGATION +policy&replica&依赖 +policy&cluster&依赖 +write&block&依赖 +policy&cost&依赖 +HDFS&policy& +writer&same rack as ###&依赖 +’s placement policy&one replica&依赖 +writer&random datanode&依赖 +writer&same rack&依赖 +inter-rack write traffic&write performance&依赖 +policy&inter-rack write traffic&依赖 +chance&rack failure&AGGREGATION +policy&impact datum reliability and availability guarantee&依赖 +it&aggregate network bandwidth&依赖 +datum&three&依赖 +datum&two unique rack&依赖 +replica&block&AGGREGATION +replica&rack&依赖 +Two replica&one rack&依赖 +one&other rack&AGGREGATION +replica&one&依赖 +different node&one rack&AGGREGATION +replica&other rack&依赖 +node&one&AGGREGATION +policy&performance&依赖 +placement&4th and following replica&AGGREGATION +maximum number&replica&AGGREGATION +maximum number&datanode&依赖 +maximum number&time&依赖 +multiple replica&same block&AGGREGATION +NameNode&same block&依赖 +maximum number&time&依赖 +maximum number&time&依赖 +total number&datanode&AGGREGATION +maximum number&time&依赖 +maximum number&datanode&依赖 +NameNode&DataNodes&依赖 +maximum number&time&依赖 +maximum number&datanode&依赖 +NameNode&multiple replica&依赖 +maximum number&time&依赖 +NameNode&addition&依赖 +NameNode&account&依赖 +support&hdf&依赖 +NameNode&rack awareness&依赖 +NameNode&policy&依赖 +NameNode&policy&依赖 +NameNode&node&依赖 +NameNode&node&依赖 +NameNode&node&依赖 +candidate node&storage&依赖 +candidate node&node&GENERALIZATION +NameNode&node&依赖 +candidate node&storage type&依赖 +NameNode&second path&依赖 +enough node&first path&依赖 +NameNode&fallback storage type&依赖 +current , default replica placement policy&progress&依赖 +current , default replica placement policy&progress&依赖 +hdf&replica&依赖 +Replica Selection&global bandwidth consumption&依赖 +hdf&read request&依赖 +HDFS cluster&multiple data center&依赖 +replica&remote replica&依赖 +Additional&4 different pluggable Block Placement policy&依赖 +their&infrastructre& +user&policy&依赖 +default hdf&BlockPlacementPolicyDefault&依赖 +default hdf&default hdf&依赖 +NameNode&special state&依赖 +NameNode&special state&依赖 +Replication&data block&AGGREGATION +NameNode&Heartbeat and Blockreport message&依赖 +NameNode&DataNodes&依赖 +Blockreport&data block&依赖 +list&data block&AGGREGATION +Blockreport&hosting&依赖 +block&replica&依赖 +block&specified minimum number&依赖 +specified minimum number&replica&AGGREGATION +data block&block&GENERALIZATION +replica&data block&AGGREGATION +minimum number&replica&AGGREGATION +namenode exit&namenode (&依赖 +namenode exit&Safemode state&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&Safemode state&依赖 +namenode exit&namenode (&依赖 +namenode exit&Safemode state&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&namenode (&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&namenode (&依赖 +namenode exit&Safemode state&依赖 +namenode exit&Safemode state&依赖 +namenode 
exit&Safemode state&依赖 +namenode exit&Safemode state&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&Safemode state&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&namenode (&依赖 +namenode exit&Safemode state&依赖 +namenode exit&safely replicate datum block check&依赖 +namenode exit&additional 30 second&依赖 +namenode exit&Safemode state&依赖 +It&list&依赖 +It&data block&依赖 +It&)&依赖 +specified number&replica&AGGREGATION +NameNode&block&依赖 +NameNode&other datanode&依赖 +HDFS namespace&NameNode&依赖 +Persistence&File System Metadata&AGGREGATION +NameNode&transaction log&依赖 +NameNode&EditLog&依赖 +NameNode&system metada&依赖 +NameNode&file&依赖 +NameNode&local host OS file system&依赖 +its&system& +entire file system namespace&file&依赖 +NameNode&system& +FsImage&’s local file system&依赖 +FsImage&file&依赖 +NameNode&memory&依赖 +NameNode&entire file system namespace and file blockmap&依赖 +image&entire file system namespace and file blockmap&AGGREGATION +checkpoint&configurable threshold&依赖 +in-memory representation&FsImage&AGGREGATION +it&FsImage and EditLog&依赖 +it&disk&依赖 +It&old EditLog&依赖 +transaction&persistent FsImage&依赖 +its&transactions& +purpose&checkpoint&AGGREGATION +hdf&file system metada&依赖 +consistent view&file system metada&AGGREGATION +hdf&consistent view&依赖 +snapshot&file system metada&AGGREGATION +it&incremental edit&依赖 +it&FsImage&依赖 +we&edit&依赖 +we&Editlog&依赖 +change&checkpoint&依赖 +change&FsImage&依赖 +given number&filesystem transaction&AGGREGATION +given number&( dfs.namenode.checkpoint.txns )&依赖 +first threshold&checkpoint&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +DataNode&knowledge&依赖 +DataNode&HDFS file&依赖 +It&HDFS datum&依赖 +It&block&依赖 +It&HDFS datum&依赖 +block&HDFS datum&AGGREGATION +It&block&依赖 +It&block&依赖 +It&HDFS datum&依赖 +DataNode&file&依赖 +DataNode&same directory&依赖 +optimal number&file&AGGREGATION +it&heuristic&依赖 +It&local file&依赖 +local file system&file&依赖 +local file system&huge number&依赖 +huge number&file&AGGREGATION +It&same directory&依赖 +local file system&single directory&依赖 +list&HDFS data block&AGGREGATION +it&local file system&依赖 +Communication Protocols All HDFS communication protocol&TCP/IP protocol&依赖 +Communication Protocols All HDFS communication protocol&top&依赖 +top&TCP/IP protocol&AGGREGATION +client&configurable TCP 
port&依赖 +client&NameNode machine&依赖 +NameNode machine&machine&GENERALIZATION +client&connection&依赖 +It&NameNode&依赖 +It&ClientProtocol&依赖 +DataNodes&DataNode Protocol&依赖 +DataNodes&NameNode&依赖 +( rpc ) abstraction&Client Protocol&依赖 +NameNode&rpc&依赖 +NameNode&design&依赖 +it&RPC request&依赖 +robustness primary objective&hdf&AGGREGATION +robustness primary objective&datum&依赖 +presence&failure&AGGREGATION +three common type&failure&AGGREGATION +Data Disk Failure&NameNode&依赖 +Data Disk Failure&Heartbeat message&依赖 +network partition&subset&依赖 +network partition&DataNodes&依赖 +subset&DataNodes&AGGREGATION +NameNode&condition&依赖 +NameNode&absence&依赖 +NameNode&Heartbeat message&依赖 +absence&Heartbeat message&AGGREGATION +NameNode mark&recent heartbeat&依赖 +datum&hdf&依赖 +DataNode death&block&依赖 +DataNode death&replication factor&依赖 +replication factor&block&AGGREGATION +their&value& +state flap&DataNodes&AGGREGATION +user&shorter interval&依赖 +HDFS architecture&data rebalancing scheme&依赖 +scheme&one DataNode&依赖 +scheme&datum&依赖 +free space&certain threshold&依赖 +scheme&one DataNode to ###&依赖 +free space&certain threshold&依赖 +scheme&additional replica&依赖 +scheme&additional replica&依赖 +scheme&particular file&依赖 +event&sudden high demand&AGGREGATION +scheme&sudden high demand&依赖 +type&data rebalancing scheme&AGGREGATION +block&datum&AGGREGATION +corruption&fault&依赖 +corruption&storage device&依赖 +checksum checking&HDFS file&依赖 +contents&HDFS file&AGGREGATION +checksum checking&contents&依赖 +checksum checking&HDFS file&依赖 +checksum checking&contents&依赖 +checksum checking&HDFS file&依赖 +checksum checking&contents&依赖 +HDFS file&file&GENERALIZATION +it&separate hidden file&依赖 +it&checksum&依赖 +it&block&依赖 +block&file and store&AGGREGATION +checksum&block&AGGREGATION +client&HDFS file&依赖 +it&file and store&依赖 +file contents&contents&GENERALIZATION +it&checksum&依赖 +client&file contents&依赖 +DataNode&replica&依赖 +DataNode&block&依赖 +central data structure&hdf&AGGREGATION +Metadata Disk Failure The FsImage&hdf&依赖 +corruption&file&AGGREGATION +corruption&HDFS instance&依赖 +multiple copy&FsImage and EditLog&AGGREGATION +update&updated synchronously&依赖 +synchronous update&rate&依赖 +rate&namespace transaction&AGGREGATION +synchronous update&support&依赖 +synchronous update&multiple copy&AGGREGATION +it&latest consistent fsimage&依赖 +it&use&依赖 +High Availability&shared storage&依赖 +High Availability&nf&依赖 +High Availability&multiple namenode&依赖 +particular instant&time&AGGREGATION +copy&datum&AGGREGATION +snapshot snapshot©&依赖 +snapshot snapshot&support&依赖 +snapshot snapshot&datum&依赖 +One usage&corrupted HDFS instance&依赖 +One usage&snapshot feature&AGGREGATION +application&datum&依赖 +they&one or more time&依赖 +hdf&write-once-read-many semantics&依赖 +hdf&file&依赖 +chunk&different DataNode&依赖 +NameNode&list&依赖 +NameNode&datanode&依赖 +Replication Pipelining&replication factor&依赖 +client&datum&依赖 +client&three&依赖 +NameNode&a replication target choose algorithm&依赖 +client&datum&依赖 +client&replication factor&依赖 +Replication Pipelining&three&依赖 +replication factor&three&AGGREGATION +list&datanode&AGGREGATION +DataNodes&replica&依赖 +DataNodes&block&依赖 +list&DataNodes&依赖 +client&first DataNode&依赖 +its&repository& +first DataNode&datum&依赖 +turn start&portion&依赖 +portion&data block&AGGREGATION +second DataNode&portion&依赖 +turn start&data block&依赖 +second DataNode&portion&依赖 +third DataNode&datum&依赖 +third DataNode&local repository&依赖 +DataNode&previous one&依赖 +DataNode&datum&依赖 +DataNode&pipeline&依赖 +datum&one DataNode&依赖 +datum&next&依赖 
+Accessibility hdf&many different way&依赖 +Accessibility hdf&application&依赖 +file&HDFS instance&AGGREGATION +hdf&part&依赖 +client&system& +hdf&’s local file system&依赖 +part&’s local file system&AGGREGATION +FS Shell HDFS&user datum&依赖 +form&files and directory&AGGREGATION +FS shell&user interact&依赖 +FS shell&datum&依赖 +syntax&command set&AGGREGATION +Action Command&directory&依赖 +Action Command&/ foodir bin/hadoop fs&依赖 +txt FS shell&application&依赖 +cat / foodir/myfile&language&依赖 +contents&file&AGGREGATION +report Recommission or decommission DataNode&) bin/hdfs dfsadmin&依赖 +refreshnodes browser interface a typical hdf&HDFS namespace&依赖 +list&DataNodes bin/hdfs dfsadmin&AGGREGATION +refreshnodes browser interface a typical hdf&web server&依赖 +Action Command&cluster&依赖 +Action Command&Safemode bin/hdfs dfsadmin&依赖 +its&files& +file&hdf&依赖 +user&/ user / /&依赖 +its&directory& +hdf&a trash directory (&依赖 +user&own trash directory&依赖 +hdf&it&依赖 +it&trash&依赖 +Most recent deleted file¤t trash directory ( / user / /&依赖 +hdf&checkpoint&依赖 +hdf&( under / user&依赖 +checkpointing&trash&AGGREGATION +expunge command&FS shell&AGGREGATION +expiry&life&AGGREGATION +NameNode&file&依赖 +NameNode&HDFS namespace&依赖 +NameNode&file&依赖 +its&life& +NameNode&trash&依赖 +deletion&block&依赖 +deletion&file&AGGREGATION +file&user&依赖 +time&corresponding increase&AGGREGATION +We&test1 & test2 )&依赖 +We&2 file test1 & test2 )&依赖 +We&file test1&依赖 +we&file&依赖 +Trash/Current&hdf&依赖 +skipTrash option&file&依赖 +we&skipTrash option&依赖 +skipTrash option&Trash.It&依赖 +We&file test1&依赖 +NameNode&excess replica&依赖 +next heartbeat transfer&information&依赖 +corresponding free space&cluster&依赖 +DataNode&corresponding block&依赖 +completion&setReplication API call&AGGREGATION +appearance&free space&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..a41386e40f8f90ec0b8a15261f81c4fec2597db8 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS-relation.txt @@ -0,0 +1,204 @@ +number&separate machine&AGGREGATION +dataset&single physical machine&依赖 +storage capacity&single physical machine&AGGREGATION +dataset&storage capacity&依赖 +it&it&依赖 +network&machine&AGGREGATION +separate machine&storage&依赖 +Hadoop&distributed filesystem&依赖 +Design&hdf&AGGREGATION +cluster&commodity hardware&AGGREGATION +“ Very large ”&file&依赖 +hundred&megabytes , gigabytes , or terabyte&AGGREGATION +“ Very large ”&file&依赖 +store petabyte&datum&AGGREGATION +Hadoop cluster&store petabyte&依赖 +Hadoop cluster&datum&依赖 +hdf&idea&依赖 +dataset&source&依赖 +various analysis&dataset over time&依赖 +Hadoop&expensive , highly reliable hardware&依赖 +Hadoop&run&依赖 +chance&large cluster&依赖 +chance&which&依赖 +cluster&commodity hardware ( commonly available hardware&AGGREGATION +chance&which&依赖 +chance&node failure&AGGREGATION +chance&large cluster&依赖 +chance&large cluster&依赖 +chance&which&依赖 +face&such failure&AGGREGATION +ten&milliseconds range&AGGREGATION +namenode&filesystem metada&依赖 +lot&small file&AGGREGATION +limit&namenode&依赖 +limit&amount&依赖 +number&file&AGGREGATION +amount&memory&AGGREGATION +namenode&memory&依赖 +file&single writer&依赖 +write&end&依赖 +write&file&依赖 +end&file&AGGREGATION +hdf&concept&依赖 +hdf&block&依赖 +concept&block&AGGREGATION +file&block-sized chunk&依赖 +several benefits.&block abstraction&依赖 +several benefits.&distributed filesystem&依赖 +’s nothing&block&依赖 +’s nothing&a file&依赖 +they&advantage&依赖 
+they&advantage of ###&依赖 +unit&abstraction a block&AGGREGATION +storage subsystem deal&storage management (&实现 +storage subsystem deal&storage management (&实现 +block&replication&依赖 +block&separate machine&依赖 +small number&separate machine and typically three )&AGGREGATION +block&small number&依赖 +block&typically three )&依赖 +reason&seek&依赖 +cost&seek&AGGREGATION +reason&cost&依赖 +time&block&依赖 +time&start&依赖 +start&block&AGGREGATION +time&disk transfer rate&依赖 +time&disk transfer rate&依赖 +time&disk transfer rate&依赖 +seek time&transfer time&AGGREGATION +many HDFS installation&128 MB block&依赖 +transfer speed&disk drive&依赖 +transfer speed&new generation&依赖 +new generation&disk drive&AGGREGATION +HDFS cluster&node operating&依赖 +HDFS cluster&namenode&依赖 +HDFS cluster&master-worker pattern&依赖 +number&datanode ( worker&AGGREGATION +two type&node operating&AGGREGATION +HDFS cluster&cluster&GENERALIZATION +HDFS cluster&two type&依赖 +namenode&filesystem namespace&依赖 +It&filesystem tree&依赖 +information&form&依赖 +information&local disk&依赖 +information&two file&依赖 +information&namespace image&依赖 +form&two file&AGGREGATION +block&datanode&依赖 +namenode&datanode&依赖 +Hadoop cluster&cluster&GENERALIZATION +one primary component&hadoop cluster and hdf&AGGREGATION +TaskTraker } hdf&hadoop cluster and hdf&依赖 +mapping&block&AGGREGATION +master ( namenode )&file system namespace operation&依赖 +system&clients& +file system&system&GENERALIZATION +datanode&filesystem&依赖 +workhorse&filesystem&AGGREGATION +list&block&AGGREGATION +they&block&依赖 +They&block&依赖 +they&list&依赖 +they&storing&依赖 +what precaution hdf&file system&依赖 +persistent state&filesystem metada&AGGREGATION +what precaution hdf&what precaution hdf&依赖 +namenode failure&persistent state&依赖 +namenode failure&filesystem metada&依赖 +case&namenode failure&AGGREGATION +first way&file&依赖 +namenode&persistent state&依赖 +namenode&multiple filesystem&依赖 +its&state& +It&secondary namenode&依赖 +its&name& +main role&namespace image&依赖 +Its&role& +namenode&file and block&依赖 +namenode&filesystem&依赖 +namenode&reference&依赖 +namenode&memory&依赖 +portion&filesystem namespace&AGGREGATION +HDFS Federation&cluster&依赖 +HDFS Federation&scale&依赖 +filesystem namespace&namespace&GENERALIZATION +HDFS Federation&cluster&依赖 +HDFS Federation&scale&依赖 +one namenode&file&依赖 +one namenode&file&依赖 +second namenode&/ share&依赖 +second namenode&file&依赖 +namenode&one another&依赖 +failure&one namenode&AGGREGATION +availability&namespace&AGGREGATION +failure&namespace&依赖 +failure&availability&依赖 +so datanodes register&multiple block pool&依赖 +so datanodes register&multiple block pool&依赖 +so datanodes register&namenode&依赖 +so datanodes register&cluster and store block&依赖 +so datanodes register&cluster and store block&依赖 +so datanodes register&multiple block pool&依赖 +so datanodes register&cluster and store block&依赖 +so datanodes register&multiple block pool&依赖 +so datanodes register&namenode&依赖 +so datanodes register&cluster and store block&依赖 +so datanodes register&namenode&依赖 +so datanodes register&namenode&依赖 +so datanodes register&cluster and store block&依赖 +so datanodes register&cluster and store block&依赖 +so datanodes register&cluster and store block&依赖 +so datanodes register&cluster and store block&依赖 +sole repository&metada&AGGREGATION +clients —&list file&依赖 +clients —&list file&依赖 +namenode&metada&依赖 +single point failure ( spof )&failure ( spof )&AGGREGATION +namenode&failure ( spof )&依赖 +whole Hadoop system&event&依赖 +administrator&new primary namenode&依赖 +administrator&filesystem metadata 
replica&依赖 +one&filesystem metadata replica&AGGREGATION +administrator&one&依赖 +new namenode&request&依赖 +its&image& +its&log& +ii )&block report&依赖 +it&memory&依赖 +it&namespace image&依赖 +it&namenode&依赖 +time&large cluster&依赖 +time&large cluster&依赖 +time&many files and block&依赖 +time&many files and block&依赖 +0.23 release series&situation&依赖 +0.23 release series&hadoop remedy&AGGREGATION +pair&implementation&依赖 +pair&implementation&依赖 +pair&implementation&依赖 +pair&implementation&依赖 +pair&namenode&AGGREGATION +standby&duty&依赖 +its&duties& +failure&active namenode&AGGREGATION +event&failure&AGGREGATION +namenode&highly-available shared storage&依赖 +namenode&memory& +datanode&namenode&依赖 +block mapping&’s memory&依赖 +datanode&block report&依赖 +namenode failover&mechanism&依赖 +transition&system&依赖 +transition&new entity&依赖 +first implementation&ZooKeeper&依赖 +case&routine maintenance&AGGREGATION +Failover&adminstrator&依赖 +Failover&routine maintenance&依赖 +Failover&example&依赖 +Failover&case&依赖 +failover controller&role&依赖 +failover controller&orderly transition&依赖 +failover controller&both namenode&依赖 +case&ungraceful failover&AGGREGATION diff --git "a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer-relation.txt" "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer-relation.txt" new file mode 100644 index 0000000000000000000000000000000000000000..7849dd744808c551c989ad4c8feb73c2f7b9f3f4 --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer-relation.txt" @@ -0,0 +1,258 @@ +file system ( hdf ) architecture –&Guide&依赖 +every datum engineer download share aniruddha bhandari — october 28 , 2020 beginner big datum data engineering hadoop overview&Components&依赖 +every datum engineer download share aniruddha bhandari — october 28 , 2020 beginner big datum data engineering hadoop overview&HDFS&依赖 +Components&HDFS&AGGREGATION +massive amount&datum&AGGREGATION +your&Tweet& +you&next Tweet&依赖 +your&message& +you&step&依赖 +you&technology&依赖 +you&datum&依赖 +you&datum&依赖 +it&single machine&依赖 +you&lovely 3 AM tweets * cough *&依赖 +your&*& +I&thinking& +you&storage component&依赖 +you&Hadoop&依赖 +you&file system ( hdf&依赖 +your&side& +storage component&Hadoop&AGGREGATION +you&amazing power&依赖 +you&file system ( hdf&依赖 +you&Hadoop&依赖 +you&storage component&依赖 +you&amazing power&依赖 +you&storage component&依赖 +amazing power&file system ( hdf&AGGREGATION +you&Hadoop&依赖 +you&amazing power&依赖 +you&file system ( hdf&依赖 +It&Hadoop&依赖 +most important component&Hadoop&AGGREGATION +its&components& +we&article&依赖 +file system ( hdf&what&依赖 +table&Contents&AGGREGATION +component&hdf&AGGREGATION +HDFS Replication Management Replication&Blocks&AGGREGATION +huge volume&datum&AGGREGATION +It&datum&依赖 +It&huge volume&依赖 +It&single machine&依赖 +it&datum&依赖 +multiple machine&storage&依赖 +network&machine&AGGREGATION +file system ( hdf )&Hadoop&依赖 +datum&machine&依赖 +datum&distributed manner&依赖 +cluster&machine&AGGREGATION +datum&cluster&依赖 +few property&existence&依赖 +its&existence& +it&few property&依赖 +petabyte&datum&AGGREGATION +philosophy&most effective data processing pattern&依赖 +It&philosophy&依赖 +Cost-effective – hdf&commodity hardware&依赖 +cluster&commodity hardware&AGGREGATION +Cost-effective – hdf&cluster&依赖 +component&file system ( hdf 
)&AGGREGATION +hdf&two main component&依赖 +– data blocks and node&data block&依赖 +hdf break&file&依赖 +hdf break&file&依赖 +it&them&依赖 +smaller unit&hdf&依赖 +you&it&依赖 +size&default&依赖 +size&default&依赖 +you&requirement&依赖 +file&size 512MB&AGGREGATION +you&file&依赖 +you&size 512MB&依赖 +it&128MB each&依赖 +it&4 block&依赖 +file&size 524MB&AGGREGATION +you&size 524MB&依赖 +it&5 block&依赖 +4&128MB each&依赖 +5th&12MB&依赖 +last block&disk&依赖 +last block&complete 128MB&依赖 +multiple block&10KB&AGGREGATION +amount&petra byte&依赖 +we&Hadoop&依赖 +we&amount&依赖 +we&datum&依赖 +order&petra byte&AGGREGATION +amount&petra byte&依赖 +amount&datum&AGGREGATION +we&block&依赖 +we&small size&依赖 +colossal number&block&AGGREGATION +block&small size&AGGREGATION +block&lot&依赖 +block&overhead&依赖 +location&block&AGGREGATION +lot&overhead&AGGREGATION +it&it&依赖 +choke&single machine&AGGREGATION +It&proper spread&依赖 +It&workload&依赖 +proper spread&workload&AGGREGATION +they&block&依赖 +Namenode&master-worker architecture&依赖 +Namenode&master-worker architecture&依赖 +filesystem tree or hierarchy&files and directory&AGGREGATION +owner&file&AGGREGATION +It&file&依赖 +It&location&依赖 +block&file&AGGREGATION +It&block&依赖 +their&size& +information&two file&依赖 +information&form&依赖 +information&Fsimage&依赖 +information&local disk&依赖 +form&two file&AGGREGATION +fsimage store&information&依赖 +fsimage store&filesystem&依赖 +fsimage store&information&依赖 +fsimage store&filesystem&依赖 +fsimage store&information&依赖 +fsimage store&filesystem&依赖 +fsimage store&filesystem&依赖 +fsimage store&filesystem&依赖 +fsimage store&information&依赖 +fsimage store&information&依赖 +fsimage store&information&依赖 +fsimage store&filesystem&依赖 +it&replication level&依赖 +it&file&依赖 +their&sizes& +it&directory&依赖 +it&modification time and permission&依赖 +Edit log&write operation&依赖 +Edit log&track&依赖 +client&that&依赖 +track&write operation&AGGREGATION +Edit log&write operation&依赖 +Edit log&track&依赖 +client&hdf&依赖 +it&Namenode&依赖 +client&information&依赖 +Namenode&block&依赖 +Namenode&location&依赖 +Namenode&location&依赖 +Namenode&block&依赖 +Namenode&block&依赖 +datanode&deletion , etc.&依赖 +datanode&block&依赖 +They&Namenode&依赖 +their&health& +They&heartbeat&依赖 +it&health&依赖 +Namenode&block&依赖 +list&block&AGGREGATION +mapping&block&AGGREGATION +Namenode&mapping&依赖 +Namenode&block&依赖 +Namenode&mapping&依赖 +DataNode&list&依赖 +Namenode&block&依赖 +DataNode&block&依赖 +Namenode&mapping&依赖 +its&memory& +node&addition&依赖 +node&node&依赖 +node&node&依赖 +node&cluster&依赖 +node&addition&依赖 +node&cluster&依赖 +two type&node&AGGREGATION +node&two type&依赖 +node&two type&依赖 +case&failure&AGGREGATION +latest copy&Edit Log&AGGREGATION +we&Edit Log&依赖 +we&latest copy&依赖 +track&transaction&AGGREGATION +we&long time&依赖 +Edit log&size&依赖 +we&node&依赖 +lot&time&AGGREGATION +filesystem&time&依赖 +we&Secondary Namenode&依赖 +Secondary Namenode&Namenode&GENERALIZATION +check‐points&’s in-memory file system metada&AGGREGATION +whose main task&Edit log&依赖 +primary&metadata& +Secondary Namenode&cluster&依赖 +whose&task& +lot&memory&AGGREGATION +Secondary namenode&separate node&依赖 +Secondary namenode&cluster&依赖 +Secondary Namenode&Namenode&依赖 +Secondary Namenode&name&依赖 +its&name& +It&Checkpointing&依赖 +copy&latest Fsimage&AGGREGATION +replication&block&AGGREGATION +one&HDFS&依赖 +one&block&依赖 +one&block&依赖 +one&best feature&AGGREGATION +one&block&依赖 +best feature&hdf&AGGREGATION +one&HDFS&依赖 +one&HDFS&依赖 +it&them&依赖 +it&block&依赖 +’s&question&依赖 +reliable storage component&Hadoop&AGGREGATION +Replication&blocks hdf&AGGREGATION +Replication&Hadoop&依赖 
+Replication&Hadoop&依赖 +block&cluster&依赖 +block&different Data node&依赖 +two more copy&it&AGGREGATION +we&much storage&依赖 +5 block&128MB each&AGGREGATION +we&128MB each&依赖 +we&5 block&依赖 +more&machine&AGGREGATION +We&cluster&依赖 +do namenode&replica&依赖 +we&Rack&依赖 +we&look&依赖 +we&Hadoop&依赖 +Rack&machine&依赖 +collection&machine&AGGREGATION +Rack&30-40&依赖 +Rack&hadoop )&依赖 +Rack awareness Replica storage&reliability and read/write bandwidth&依赖 +we&fault tolerance&依赖 +replica&same node&依赖 +Hadoop&deal&依赖 +Hadoop&default strategy&依赖 +first replica&example&依赖 +client&same Datanode&依赖 +first replica&same Datanode&依赖 +second replica&different Datanode&依赖 +third replica&different Datanode&依赖 +third replica&same rack&依赖 +third replica&second&依赖 +subsequent replica&random Data node&依赖 +subsequent replica&cluster&依赖 +I&solid understanding&依赖 +it&datum&依赖 +I&what&依赖 +file system ( hdf )&what&依赖 +better understanding&Hadoop&AGGREGATION +I&Hadoop&依赖 +Definitive Guide&Guide&GENERALIZATION +I&Definitive Guide&依赖 +MapReduce Types&Tables&AGGREGATION +article&it&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..29ccf27c8a96e9747d79f32ab2a23ec10bf18885 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions-relation.txt @@ -0,0 +1,457 @@ +career&Exclusive offer&依赖 +career&Exclusive offer&依赖 +career&Big Data Course !!&依赖 +your&career& +career&Big Data Course !!&依赖 +your&questions& +world&system& +its&tolerance& +It&fault tolerance and high availability&依赖 +you&article&依赖 +you&HDFS Architecture Guide&依赖 +Hadoop HDFS&HDFS&GENERALIZATION +Assumptions and goal&HDFS design&AGGREGATION +HDFS architecture tutorial&NameNode , DataNode&依赖 +HDFS architecture tutorial&HDFS , Secondary node , checkpoint node , Backup Node&依赖 +HDFS architecture tutorial&HDFS&依赖 +HDFS architecture tutorial&detailed architecture&依赖 +detailed architecture&Hadoop HDFS&AGGREGATION +hdf feature&Rack awareness&依赖 +hdf feature&Rack awareness&依赖 +hdf store&running&依赖 +hdf store&large file&依赖 +cluster&commodity hardware&AGGREGATION +It&large file&依赖 +It&storage&依赖 +It&less number&依赖 +It&principle&依赖 +storage&less number&AGGREGATION +huge number&small file&AGGREGATION +less number&large file&AGGREGATION +principle&storage&AGGREGATION +HDFS stores datum&case&依赖 +case&hardware failure&AGGREGATION +hardware failure&failure&GENERALIZATION +HDFS stores datum&hardware failure&依赖 +HDFS instance&hundred or thousand&依赖 +HDFS instance&server machine&依赖 +hundred or thousand&server machine&AGGREGATION +system&data& +part&’s datum&AGGREGATION +huge number&component&AGGREGATION +core architectural goal&hdf&AGGREGATION +data access HDFS application&dataset&依赖 +data access HDFS application&streaming access&依赖 +their&datasets& +Hadoop hdf&user&依赖 +Hadoop hdf&batch processing&依赖 +Hadoop hdf&interactive use&依赖 +high throughput&data access&AGGREGATION +force&data access&依赖 +low latency&data access&AGGREGATION +force&data access&依赖 +force&low latency&依赖 +Large datasets hdf&large data set&依赖 +file&standard practice&依赖 +file&standard practice&依赖 +architecture&such a way&依赖 +huge amount&datum&AGGREGATION +architecture&best&依赖 +architecture&such a way&依赖 +architecture&hdf&AGGREGATION +architecture&best&依赖 +to hundred&node&AGGREGATION +ton&million&AGGREGATION 
+million&file&AGGREGATION +Simple coherency model&file&依赖 +Simple coherency model&write-once-read-many access model&依赖 +theory&write-once-read-many access model&AGGREGATION +Simple coherency model&theory&依赖 +MapReduce-based application or web crawler application&model&依赖 +main advantage&system&依赖 +overall throughput&system&AGGREGATION +main advantage&overall throughput&依赖 +It&network congestion&依赖 +it&one platform&依赖 +widespread adoption&hdf&AGGREGATION +large set&datum&AGGREGATION +Hadoop Distributed File System&master-slave architecture&依赖 +master-slave architecture&architecture&GENERALIZATION +cluster&single master node&依赖 +file&one or more block&依赖 +block&different slave machine&依赖 +you&this article )&依赖 +you&which&依赖 +block&file&AGGREGATION +master node store&master node store&依赖 +slave node&data block&依赖 +data block&file&AGGREGATION +slave node&file&依赖 +centerpiece&Hadoop Distributed File System&AGGREGATION +NameNode&Hadoop Distributed File System&依赖 +It&file system namespace&依赖 +namenode store information&two file&依赖 +namenode store information&two file&依赖 +namenode store information&namenode store information&依赖 +namenode store information&two file&依赖 +namenode store information&two file&依赖 +namenode store information&two file&依赖 +namenode store information&form&依赖 +namenode store information&form&依赖 +namenode store information&two file&依赖 +namenode store information&namenode store information&依赖 +namenode store information&namenode store information&依赖 +namenode store information&form&依赖 +namenode store information&form&依赖 +namenode store information&local disk&依赖 +namenode store information&local disk&依赖 +namenode store information&namenode store information&依赖 +namenode store information&two file&依赖 +namenode store information&namenode store information&依赖 +namenode store information&local disk&依赖 +namenode store information&namenode store information&依赖 +namenode store information&form&依赖 +namenode store information&namenode store information&依赖 +namenode store information&two file&依赖 +form&two file&AGGREGATION +namenode store information&namenode store information&依赖 +namenode store information&form&依赖 +namenode store information&form&依赖 +namenode store information&local disk&依赖 +namenode store information&form&依赖 +namenode store information&local disk&依赖 +namenode store information&two file&依赖 +namenode store information&form&依赖 +namenode store information&form&依赖 +namenode store information&two file&依赖 +namenode store information&two file&依赖 +namenode store information&form&依赖 +namenode store information&namenode store information&依赖 +namenode store information&local disk&依赖 +namenode store information&two file&依赖 +namenode store information&namenode store information&依赖 +namenode store information&local disk&依赖 +namenode store information&two file&依赖 +namenode store information&namenode store information&依赖 +namenode store information&form&依赖 +namenode store information&local disk&依赖 +namenode store information&two file&依赖 +namenode store information&local disk&依赖 +namenode store information&two file&依赖 +namenode store information&form&依赖 +namenode store information&form&依赖 +namenode store information&namenode store information&依赖 +namenode store information&form&依赖 +namenode store information&local disk&依赖 +namenode store information&form&依赖 +namenode store information&local disk&依赖 +namenode store information&namenode store information&依赖 +namenode store information&local disk&依赖 +namenode store information&form&依赖 +namenode store information&namenode store information&依赖 
+namenode store information&local disk&依赖 +namenode store information&two file&依赖 +namenode store information&local disk&依赖 +namenode store information&two file&依赖 +namenode store information&two file&依赖 +namenode store information&local disk&依赖 +namenode store information&namenode store information&依赖 +namenode store information&two file&依赖 +namenode store information&namenode store information&依赖 +namenode store information&local disk&依赖 +namenode store information&local disk&依赖 +namenode store information&local disk&依赖 +namenode store information&namenode store information&依赖 +namenode store information&local disk&依赖 +namenode store information&namenode store information&依赖 +namenode store information&form&依赖 +namenode store information&namenode store information&依赖 +namenode store information&two file&依赖 +namenode store information&local disk&依赖 +namenode store information&namenode store information&依赖 +namenode store information&form&依赖 +namenode store information&form&依赖 +fsimage stand&File System image&依赖 +fsimage stand&File System image&依赖 +It&NameNode creation&依赖 +It&complete namespace&依赖 +complete namespace&Hadoop file system&AGGREGATION +It&recent change&依赖 +It&file system namespace operation&依赖 +function&HDFS NameNode&AGGREGATION +NameNode&DataNodes&依赖 +mapping&block&AGGREGATION +It&DataNodes&依赖 +It&mapping&依赖 +It&block&依赖 +It&file&依赖 +namenode record&change&依赖 +namenode record&made&依赖 +It&location&依赖 +It&file&依赖 +location&block&AGGREGATION +It&block&依赖 +NameNode&care&依赖 +NameNode&block&依赖 +NameNode&replication factor&依赖 +replication factor&block&AGGREGATION +NameNode&datanode&依赖 +NameNode&heartbeat and block report&依赖 +NameNode&new datanode&依赖 +NameNode&new replica&依赖 +NameNode&failure&依赖 +NameNode&Hadoop2&依赖 +single point&failure&AGGREGATION +High Availability Hadoop cluster architecture&two or more namenode&依赖 +High Availability Hadoop cluster architecture&running&依赖 +High Availability Hadoop cluster architecture&two or more namenode&依赖 +High Availability Hadoop cluster architecture&running&依赖 +datanode&Hadoop HDFS&依赖 +They&file&依赖 +They&block&依赖 +function&DataNode DataNode&AGGREGATION +DataNodes&block creation&依赖 +datanode&heartbeat&依赖 +health&hdf&AGGREGATION +datanode&NameNode&依赖 +list&block&AGGREGATION +datanode&block report&依赖 +datanode&namenode&依赖 +hdf architecture secondary namenode&hdf architecture secondary namenode&依赖 +daemon&DataNode and NameNode&依赖 +daemon&DataNode and NameNode&依赖 +Secondary NameNode&primary NameNode&依赖 +Secondary NameNode&helper node&依赖 +helper node&node&GENERALIZATION +NameNode&file&依赖 +NameNode&restart&依赖 +NameNode&long time&依赖 +size&edit log&AGGREGATION +Secondary NameNode&issue&依赖 +Secondary NameNode&NameNode&GENERALIZATION +Secondary NameNode download&NameNode&依赖 +Secondary NameNode download&file&依赖 +It&Fsimage&依赖 +It&edit log&依赖 +its&restart& +updated Fsimage&NameNode&依赖 +NameNode&edit log record&依赖 +secondary NameNode&hdf&依赖 +secondary NameNode®ular checkpoint&依赖 +node&checkpoint&依赖 +node&namespace&依赖 +checkpoint&namespace&AGGREGATION +Checkpoint Node&hadoop first download fsimage&依赖 +Checkpoint Node&hadoop first download fsimage&依赖 +it&them Fsimage and edit&依赖 +it&Fsimage and edit&依赖 +it&new image&依赖 +it&new image&依赖 +directory&same structure&依赖 +directory&Namenode ’s directory&依赖 +It&latest checkpoint&依赖 +It&directory&依赖 +Backup node&node&GENERALIZATION +Backup node&in-memory , up-to-date copy&依赖 +Backup node&in-memory , up-to-date copy&依赖 +Backup node&file system namespace&依赖 +in-memory , up-to-date copy&file system 
namespace&AGGREGATION +Backup node&file system namespace&依赖 +It&active NameNode state&依赖 +It&namespace state&依赖 +It&up-to-date state&依赖 +It&namespace state&依赖 +up-to-date state&namespace state&AGGREGATION +It&up-to-date state&依赖 +it&namespace&依赖 +NameNode&time&依赖 +NameNode&one Backup node&依赖 +different type&node&AGGREGATION +we&HDFS&依赖 +we&HDFS Architecture tutorial&依赖 +we&Blocks&依赖 +us&block&依赖 +us&hdf&依赖 +hdf&block-sized chunk&依赖 +hdf&file&依赖 +size&block&AGGREGATION +size&128 mb&依赖 +size&default&依赖 +One&block size&依赖 +One&requirement&依赖 +block size&size&GENERALIZATION +hdf&four block&依赖 +size 128 Mb&128 Mb&GENERALIZATION +file&size 612 Mb&AGGREGATION +hdf&size 128 Mb&依赖 +four block&size 128 Mb&AGGREGATION +hdf&four block&依赖 +one block&size 100 Mb&AGGREGATION +hdf&size 128 Mb&依赖 +file&smaller size&AGGREGATION +file&full block size space&依赖 +file&2 Mb space&依赖 +file&size 2 Mb&AGGREGATION +file&disk&依赖 +user&location&依赖 +user&block&依赖 +user&control&依赖 +what&HDFS fault-tolerant&依赖 +datum&other machine&依赖 +datum&multiple place&依赖 +datum&distributed system&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&block&AGGREGATION +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&Hadoop&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&hdfs store replica&依赖 +hdfs store replica&Hadoop&依赖 +replication factor©&依赖 +number©&AGGREGATION +three copy&different datanode&依赖 +three copy&block&AGGREGATION +block&replica&依赖 +block&block&依赖 +block&other DataNode&依赖 +replica&block&AGGREGATION +data block&block&GENERALIZATION +128 = 384 ) 384 mb&disk space&AGGREGATION +we&128 Mb&依赖 +we&file&依赖 +file&128 Mb&AGGREGATION +128 = 384 ) 384 mb&file&依赖 +replication mechanism&HDFS fault-tolerant&依赖 +collection&40-50 machine datanode&AGGREGATION +Rack&40-50 machine datanode&依赖 +NameNode&rack awareness algorithm&依赖 +NameNode&replica&依赖 +second replica&same rack&依赖 +second replica&other DataNode&依赖 +client&hdf&依赖 +it&metada&依赖 +client&file&依赖 +it&NameNode&依赖 +their&location& +Namenode&block&依赖 +number&block&AGGREGATION +Namenode&number&依赖 +client&DataNode&依赖 +client&DataNode 1&依赖 +IP&other two datanode&AGGREGATION +client&block A&依赖 +Datanode 1&client&依赖 +datanode 1 copy&same rack&依赖 +datanode 1 copy&same rack&依赖 +datanode 1 copy&same block&依赖 +datanode 1 copy&same rack&依赖 +Datanode 1&block a&依赖 
+datanode 1 copy&DataNode 2&依赖 +datanode 1 copy&same block&依赖 +datanode 1 copy&DataNode 2&依赖 +datanode 1 copy&same rack&依赖 +datanode 1 copy&same block&依赖 +DataNode 2&same rack&AGGREGATION +datanode 1 copy&DataNode 2&依赖 +datanode 1 copy&same block&依赖 +datanode 1 copy&DataNode 2&依赖 +DataNodes&transfer&依赖 +DataNodes&rack switch&依赖 +datanode 2 copy&same block&依赖 +datanode 2 copy&different rack&依赖 +datanode 2 copy&same block&依赖 +datanode 2 copy&same block&依赖 +datanode 2 copy&different rack&依赖 +datanode 2 copy&same block&依赖 +datanode 2 copy&same block&依赖 +datanode 2 copy&different rack&依赖 +datanode 2 copy&same block&依赖 +datanode 2 copy&different rack&依赖 +datanode 2 copy&different rack&依赖 +datanode 2 copy&different rack&依赖 +datanode 2 copy&different rack&依赖 +datanode 2 copy&same block&依赖 +datanode 2 copy&same block&依赖 +datanode 2 copy&different rack&依赖 +it&Namenode&依赖 +DataNode&client&依赖 +it&confirmation&依赖 +DataNode&block&依赖 +same process&file&依赖 +same process&block&依赖 +client&metada&依赖 +client&NameNode&依赖 +Namenode&location&依赖 +location&datanode&AGGREGATION +client&DataNodes&依赖 +client&data parallelly&依赖 +datum&client&依赖 +datum&DataNode&依赖 +it&block&依赖 +it&form&依赖 +client or application&file&依赖 +client or application&block&依赖 +it&original file&依赖 +form&original file&AGGREGATION +client&Hadoop HDFS&依赖 +client&file&依赖 +file&data block&依赖 +file&A , B , C in ###&依赖 +file&A , B , C&依赖 +file&block&依赖 +block&different datanode&依赖 +Block A and datanode-1 ( dn-1 )&datanode-6 ( dn-6 )&依赖 +Block A and datanode-1 ( dn-1 )&block b&依赖 +Block A and datanode-1 ( dn-1 )&datanode-6 ( dn-6 )&依赖 +Block A and datanode-1 ( dn-1 )&block b&依赖 +2 replica&block&AGGREGATION +case&datanode failure or rack failure&AGGREGATION +we&file&依赖 +size&default&依赖 +we&requirement&依赖 +size&default&依赖 +we&which&依赖 +master node ( namenode ) store&metada&依赖 +master node ( namenode ) store&block location&依赖 +Master Node&DataNodes&依赖 +hdf&block&依赖 +hdf&replica&依赖 +NameNode&Rack Awareness algorithm&依赖 +our&course& +Your&Career& +you&difficulty&依赖 +you&HDFS Architecture tutorial&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3ce0b17765db36bdf66e9250cf0cd8366610eaa --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-relation.txt @@ -0,0 +1,359 @@ +post&monitor hadoop health and performance&依赖 +post&4-part series&依赖 +part 1&4-part series&AGGREGATION +Part 4&Hadoop deployment&依赖 +Part 2&part 3 detail&依赖 +Hadoop&metrics& +Apache Hadoop&large data set&依赖 +Apache Hadoop&Hadoop&GENERALIZATION +distributed computation and storage&large data set&AGGREGATION +Apache Hadoop&computer cluster&依赖 +Apache Hadoop&distributed computation and storage&依赖 +Hadoop&’s mapreduce programming model&实现 +Google&model& +rich ecosystem&related technology&AGGREGATION +Hadoop&Facebook&依赖 +Hadoop&widespread adoption&依赖 +Hadoop&many company&依赖 +Hadoop architecture overview Hadoop&three core component&依赖 +you&high availability&依赖 +hdf&term “ master ”&依赖 +hdf&primary node&依赖 +we&more inclusive term “ leader&依赖 +we&original term&依赖 +we&case&依赖 +file system&Hadoop cluster&AGGREGATION +Hadoop cluster&cluster&GENERALIZATION +file system ( hdf )&Hadoop cluster&依赖 +Several attribute&other distributed file system&依赖 +Several attribute&hdf&依赖 +default block size&128 MB&AGGREGATION +total&three copy&AGGREGATION +block&two 
replica&依赖 +datum&default replication factor&依赖 +default replication factor&three&AGGREGATION +datum&three&依赖 +hdf&it&依赖 +hdf&cluster&依赖 +Vanilla HDFS High-availability HDFS hdf&leader/follower architecture&依赖 +cluster&single NameNode&依赖 +event&failure )&AGGREGATION +arbitrary number&DataNodes&AGGREGATION +NameNode&file&依赖 +NameNode&once broker&依赖 +NameNode&file system namespace&依赖 +NameNode&client&依赖 +NameNode&addition&依赖 +NameNode&leader and brokers access&依赖 +its&state& +NameNode&state&依赖 +It&failure&依赖 +It&Hadoop cluster&依赖 +It&single point&依赖 +single point&failure&AGGREGATION +production cluster&case&依赖 +case&a single disk failure )&AGGREGATION +production cluster&state&依赖 +production cluster&a single disk failure )&依赖 +production cluster&state&依赖 +case&total machine failure )&AGGREGATION +standby NameNode&NameNode&GENERALIZATION +Hadoop&standby NameNode&依赖 +Earlier version&Hadoop&AGGREGATION +Earlier version&SecondaryNameNode concept&依赖 +introduction&SecondaryNameNode concept&AGGREGATION +Earlier version&introduction&依赖 +today&SecondaryNameNode&依赖 +Earlier version&alternative&依赖 +function&SecondaryNameNode&AGGREGATION +understand&explanation&依赖 +understand&mechanism&依赖 +NameNode&state&依赖 +explanation&mechanism&AGGREGATION +NameNode&mechanism&依赖 +fsimage&fsimage&依赖 +fsimage&NameNode stores file system metada&依赖 +fsimage&two different file&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&’s metada&依赖 +fsimage store&’s metada&依赖 +complete snapshot&’s metada&AGGREGATION +fsimage store&’s metada&依赖 +fsimage store&’s metada&依赖 +fsimage store&’s metada&依赖 +system&metadata& +fsimage store&complete snapshot&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&’s metada&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&’s metada&依赖 +fsimage store&’s metada&依赖 +fsimage store&’s metada&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&’s metada&依赖 +fsimage store&complete snapshot&依赖 +fsimage store&’s metada&依赖 +fsimage store&’s metada&依赖 +Incremental change&durability&依赖 +Incremental change&edit log&依赖 +NameNode&place&依赖 +NameNode&state&依赖 +separation&concern&AGGREGATION +NameNode&concern&依赖 +NameNode&state&依赖 +copy&fsimage&AGGREGATION +its©& +SecondaryNameNode&fsimage&依赖 +time change&edit log&依赖 +SecondaryNameNode©&依赖 +cluster administrator&fsimage&依赖 +NameNode&SecondaryNameNode&依赖 +cluster administrator&fsimage&依赖 +presence&SecondaryNameNode&AGGREGATION +NameNode&presence&依赖 +updated copy&fsimage&AGGREGATION +top&fsimage&AGGREGATION +cluster administrator&updated copy&依赖 +NameNode&edit log&依赖 +cluster administrator&updated copy&依赖 +event&NameNode failure&AGGREGATION +its&name& +HA NameNode service Early version&several concept&依赖 +HA NameNode service Early version&Hadoop&AGGREGATION +HA NameNode service Early version&secondarynamenode&依赖 +HA NameNode service Early version&other )&依赖 +mechanism&Hadoop 2.0&依赖 +event&primary NameNode failure&AGGREGATION +high availability&shared storage&依赖 +high availability&shared storage&依赖 +high availability&primary and standby&依赖 +high availability&edit log )&依赖 +high availability&primary and standby&依赖 +high availability&edit log )&依赖 +consistency&active and standby state&AGGREGATION +group&journalnodes ( jns )&AGGREGATION +majority&JournalNodes&AGGREGATION +Active node&namespace&依赖 +it&record&依赖 +it&JournalNodes&依赖 
+record&change&AGGREGATION +it&change&依赖 +it&majority&依赖 +Active node&node&GENERALIZATION +StandbyNode&edit log&依赖 +its&namespace& +StandbyNode&jn&依赖 +StandbyNode&change&依赖 +QJM interaction diagram JournalNode daemon&low overhead&依赖 +daemon&Hadoop node&依赖 +daemon&same machine&依赖 +daemon&ResourceManager&依赖 +edit log change&JNs&依赖 +quorum&JNs&AGGREGATION +you&odd number&依赖 +odd number&least three daemon&AGGREGATION +you&running&依赖 +edit log change&quorum&依赖 +number&jn )&AGGREGATION +JournalNodes&failure&依赖 +JournalNodes&2 node&依赖 +N&jn )&依赖 +Alternative file systems hdf&Hadoop&依赖 +number&alternative file system&AGGREGATION +’s file system abstraction&local file system&依赖 +’s file system abstraction&alternative file system&依赖 +Azure&system& +’s file system abstraction&number&依赖 +Hadoop&abstraction& +file system&access URI&依赖 +production hdf&production hdf&依赖 +Most&limitation&依赖 +core&set&依赖 +core&MapReduce job&AGGREGATION +MapReduce job&job&GENERALIZATION +core&input datum&依赖 +collection&< key&AGGREGATION +top-level unit&work&AGGREGATION +job&one or more map&依赖 +canonical example&MapReduce job&AGGREGATION +canonical example&word frequency&依赖 +body&text&AGGREGATION +image&example&依赖 +MapReduce&own resource allocation and job scheduling&依赖 +care&own resource allocation and job scheduling&AGGREGATION +earlier version&hadoop ( pre-2 .0 )&AGGREGATION +its&allocation& +MapReduce&own resource allocation and job scheduling&依赖 +MapReduce&care&依赖 +MapReduce&care&依赖 +MapReduce&own resource allocation and job scheduling&依赖 +MapReduce&care&依赖 +MapReduce&own resource allocation and job scheduling&依赖 +MapReduce&care&依赖 +Newer version&computation&依赖 +allocation&computational resource&AGGREGATION +Newer version&scheduling&依赖 +Newer version&hadoop ( 2.0 + )&AGGREGATION +Hadoop&box&依赖 +number&framework&AGGREGATION +Hadoop&MapReduce&依赖 +article series&compute framework&依赖 +article series&MapReduce&依赖 +Hadoop architecture&three core component&依赖 +Hadoop architecture&three core component&依赖 +YARN&uncommon way&依赖 +YARN&common term&依赖 +most people&container ”&依赖 +a resource container ( rc )&collection&依赖 +collection&physical resource&AGGREGATION +a resource container ( rc )&physical resource&依赖 +it&new meaning&依赖 +“ Application ”&YARN&依赖 +application&set&依赖 +application&task&依赖 +set&task&AGGREGATION +MapReduce&concept& +Application&’s job concept&依赖 +Application&’s job concept&依赖 +ResourceManager The ResourceManager&YARN&依赖 +inventory&available resource&AGGREGATION +most important&which&AGGREGATION +scheduler scheduler component&YARN ResourceManager&AGGREGATION +scheduler scheduler component&resource&依赖 +it&application status or progress&依赖 +it&monitoring&依赖 +YARN&several scheduler policy&依赖 +YARN&Hadoop 2.7.2&依赖 +Scheduler&resource&依赖 +bundle&physical resource )&AGGREGATION +default scheduler&Hadoop distribution&依赖 +its&instance& +application&own dedicated ApplicationMaster instance&依赖 +application&own dedicated ApplicationMaster instance&依赖 +instance&one&依赖 +one&node&AGGREGATION +instance&node&依赖 +instance&cluster&依赖 +its&container& +instance&own , separate container&依赖 +’s applicationmaster&ResourceManager&依赖 +’s applicationmaster&heartbeat message&依赖 +application&ApplicationMaster& +assignment&Container Resource lease&AGGREGATION +Additional resource&assignment&依赖 +Additional resource&Container Resource lease&依赖 +Additional resource&ResourceManager&依赖 +ApplicationMaster&execution&依赖 +execution&application&AGGREGATION +ApplicationMaster&application&依赖 +ApplicationMaster&full lifespan&依赖 +its&lifespan& 
+their&lifecycles& +nodemanager&earlier version&依赖 +earlier version&Hadoop&AGGREGATION +nodemanager&tasktracker&依赖 +nodemanager&Hadoop&依赖 +nodemanager&dynamically create , arbitrarily-sized resource containers ( rc )&依赖 +nodemanager&number&依赖 +tasktracker&fixed number&依赖 +fixed number&map&AGGREGATION +tasktracker&map&依赖 +number&dynamically create , arbitrarily-sized resource containers ( rc )&AGGREGATION +application&flow&依赖 +application&flow&依赖 +application&flow&依赖 +application&flow&依赖 +Client program&MapReduce application&依赖 +MapReduce application&application&GENERALIZATION +Client program&ResourceManager&依赖 +ResourceManager&ApplicationMaster&依赖 +ResourceManager&container&依赖 +applicationmaster boot&original calling client&依赖 +applicationmaster boot&ResourceManager&依赖 +applicationmaster boot&ResourceManager&依赖 +applicationmaster boot&ResourceManager&依赖 +applicationmaster boot&original calling client&依赖 +applicationmaster boot&original calling client&依赖 +client application&application&GENERALIZATION +ApplicationMaster&client application&依赖 +ApplicationMaster&resource and ( resource container&依赖 +NodeManager&container&依赖 +ApplicationMaster&container launch specification&依赖 +ApplicationMaster&NodeManager&依赖 +NodeManager&application&依赖 +client poll&execution&依赖 +client poll&application status and progress&依赖 +applicationmaster deregister&completion&依赖 +applicationmaster deregister&ResourceManager&依赖 +its&containers& +applicationmaster deregister&completion&依赖 +applicationmaster deregister&completion&依赖 +applicationmaster deregister&ResourceManager&依赖 +applicationmaster deregister&ResourceManager&依赖 +coordination and synchronization&distributed system&AGGREGATION +high-availability&former single point&AGGREGATION +former single point&failure&AGGREGATION +NameNode&failure —&依赖 +previous version&Hadoop&AGGREGATION +NameNode&single point&依赖 +single point&failure —&AGGREGATION +Hadoop 2.0&high-availability NameNode service&依赖 +Hadoop 2.0&many improvement&依赖 +Hadoop 2.0&them&依赖 +ZooKeeper&qjm or nf&依赖 +it&automatic failover&依赖 +ZooKeeper&conjunction&依赖 +Automatic NameNode failover&two component&依赖 +NameNode&equivalent&依赖 +NameNode and Standby NameNodes&ZooKeeper&依赖 +NameNode and Standby NameNodes&persistent session&依赖 +NameNode&file or directory&依赖 +NameNode&a regular file system )&依赖 +NameNode&special , ephemeral “ lock ” znode&依赖 +its&session& +NameNode&ZooKeeper ensemble&依赖 +NameNode&contact&依赖 +equivalent&file or directory&AGGREGATION +health&node&AGGREGATION +a failover (&health&依赖 +a failover (&node&依赖 +other node&lock (&依赖 +new namenode transition&active NameNode&依赖 +new namenode transition&active NameNode&依赖 +new namenode transition&active NameNode&依赖 +its&ResourceManager& +Hadoop 2.4&’s resilience&依赖 +release&ResourceManager high-availability feature&AGGREGATION +Hadoop 2.4&ResourceManager high-availability feature&依赖 +YARN&resilience& +Hadoop 2.4&release&依赖 +event&primary ’s failure&AGGREGATION +new feature&ZooKeeper&依赖 +YARN&similar , ZooKeeper-managed lock&依赖 +YARN&hdf&依赖 +ActiveStandbyElector service&ResourceManager process&依赖 +its&service& +part&ResourceManager process&AGGREGATION +YARN&mechanism& +ActiveStandbyElector service&ephemeral znode&依赖 +ActiveStandbyElector service&control&依赖 +ActiveStandbyElector service&ZKFailoverController&依赖 +ActiveStandbyElector service&control&依赖 +ActiveStandbyElector service&ephemeral znode&依赖 +control&ephemeral znode and ActiveStandbyElectorLock&AGGREGATION +ActiveStandbyElector service&ZKFailoverController&依赖 +RM&lock&依赖 +active RM&session&依赖 
+RM&active state&依赖 +RM&ActiveStandbyElectorLock&依赖 +we&found&依赖 +we&found&依赖 +we&core component&依赖 +we&core component&依赖 +examination&’s key performance metric and health indicator&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-simEnts.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-simEnts.txt new file mode 100644 index 0000000000000000000000000000000000000000..26b458fe0aa746913c9988ea22390885585428ff --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-simEnts.txt @@ -0,0 +1,76 @@ +Namenode,Namenode +NameNode,class INode +NameNode,class INodeReference +NameNode,class INodesInPath +NameNode,class INodeDirectory +NameNode,class INodeWithAdditionalFields +NameNode,class XAttrFeature +NameNode,class FileUnderConstructionFeature +NameNode,class DirectoryWithSnapshotFeature +NameNode,class DirectorySnapshottableFeature +NameNode,class AclFeature +NameNode,class DirectoryWithQuotaFeature +NameNode,class EditLogFileOutputStream +NameNode,class EditLogBackupOutputStream +NameNode,class QuorumOutputStream +NameNode,class JournalSetOutputStream +NameNode,class EditLogFileInputStream +NameNode,class EditLogBackupInputStream +NameNode,class EditLogByteInputStream +NameNode,class RedundantEditLogInputStream +INode,INode +INodeReference,INodeReference +INodesInPath,INodesInPath +INodeDirectory,INodeDirectory +INodeWithAdditionalFields,INodeWithAdditionalFields +Feature,Feature +XAttrFeature,XAttrFeature +FileUnderConstructionFeature,FileUnderConstructionFeature +DirectoryWithSnapshotFeature,DirectoryWithSnapshotFeature +DirectorySnapshottableFeature,DirectorySnapshottableFeature +AclFeature,AclFeature +DirectoryWithQuotaFeature,DirectoryWithQuotaFeature +EditLogFileOutputStream,EditLogFileOutputStream +EditLogBackupOutputStream,EditLogBackupOutputStream +EditLogBackupOutputStream,Edit log +QuorumOutputStream,QuorumOutputStream +QuorumOutputStream,ByteRangeInputStream +JournalSetOutputStream,JournalSetOutputStream +EditLogFileInputStream,EditLogFileInputStream +EditLogBackupInputStream,EditLogBackupInputStream +EditLogBackupInputStream,Edit log +EditLogByteInputStream,EditLogByteInputStream +EditLogByteInputStream,Edit log +RedundantEditLogInputStream,RedundantEditLogInputStream +Datanode,Datanode +DataNode,class Storage +DataNode,class DataStorage +DataNode,class StorageInfo +DataNode,class BlockPoolSlice +DataNode,class FsVolumeImpl +DataNode,class BlockManager +Tools,class DFSAdmin +Tools,class AdminHelper +Tools,class ECAdmin +Tools,class CryptoAdmin +Balancer,class DFSAdmin +Balancer,class AdminHelper +Balancer,class ECAdmin +Balancer,class CryptoAdmin +Protocol,class ClientProtocol +Protocol,class DataNodeProtocol +Protocol,class InterDataNodeProtocol +Security,class LightWeightHashSet +Security,class LightWeightLinkedSet +Security,class LinkedSetIterator +Security,class ImageVisitor +Security,class LsImageVisitor +Security,class XmlImageVisitor +Security,class FileDistributionVisitor +Security,class IndentedImageVisitor +Client,class DFSClient +Client,class DFSOutputStream +Client,class DfsClientConf +Client,class BlockReaderFactory +Client,class StrippedDataStreamer +Common,class Command diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt index ff2709878573fee0a1b9ad45dec9703024d6e3a9..117d4bb994227d32f316e7e1518d473bba5a51d7 100644 --- 
a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt @@ -1,4 +1,23 @@ Namenode , Namenode +NameNode , class I Node +NameNode , class I Node Reference +NameNode , class I Nodes In Path +NameNode , class I Node Directory +NameNode , class I Node With Additional Fields +NameNode , class X Attr Feature +NameNode , class File Under Construction Feature +NameNode , class Directory With Snapshot Feature +NameNode , class Directory Snapshottable Feature +NameNode , class Acl Feature +NameNode , class Directory With Quota Feature +NameNode , class Edit Log File Output Stream +NameNode , class Edit Log Backup Output Stream +NameNode , class Quorum Output Stream +NameNode , class Journal Set Output Stream +NameNode , class Edit Log File Input Stream +NameNode , class Edit Log Backup Input Stream +NameNode , class Edit Log Byte Input Stream +NameNode , class Redundant Edit Log Input Stream INode , INode INode Reference , INode Reference INodes In Path , INodes In Path @@ -23,4 +42,35 @@ Edit Log Backup Input Stream , Edit log Edit Log Byte Input Stream , Edit Log Byte Input Stream Edit Log Byte Input Stream , Edit log Redundant Edit Log Input Stream , Redundant Edit Log Input Stream -Datanode , Datanode \ No newline at end of file +Datanode , Datanode +DataNode , class Storage +DataNode , class Data Storage +DataNode , class Storage Info +DataNode , class Block Pool Slice +DataNode , class Fs Volume Impl +DataNode , class Block Manager +Tools , class D F S Admin +Tools , class Admin Helper +Tools , class E C Admin +Tools , class Crypto Admin +Balancer , class D F S Admin +Balancer , class Admin Helper +Balancer , class E C Admin +Balancer , class Crypto Admin +Protocol , class Client Protocol +Protocol , class Data Node Protocol +Protocol , class Inter Data Node Protocol +Security , class Light Weight Hash Set +Security , class Light Weight Linked Set +Security , class Linked Set Iterator +Security , class Image Visitor +Security , class Ls Image Visitor +Security , class Xml Image Visitor +Security , class File Distribution Visitor +Security , class Indented Image Visitor +Client , class D F S Client +Client , class D F S Output Stream +Client , class Dfs Client Conf +Client , class Block Reader Factory +Client , class Stripped Data Streamer +Common , class Command \ No newline at end of file diff --git "a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory-relation.txt" "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory-relation.txt" new file mode 100644 index 0000000000000000000000000000000000000000..eeb4d17504acdf54850585ebe2c2dc45d921f33b --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory-relation.txt" @@ -0,0 +1,152 @@ +Hadoop cluster&kove ® xpd ™ persistent memory&依赖 +Hadoop cluster&kove ® xpd ™ persistent memory&依赖 +its&information& +Hadoop cluster&NameNode server&依赖 +failure and source&data loss&AGGREGATION +Hadoop cluster&most vital information&依赖 +Hadoop cluster&RAM&依赖 +potential point&failure and source&AGGREGATION +its&server& +usual precaution&form&依赖 +datum&case&依赖 +its&operation& +datum&failure&依赖 +case&failure&AGGREGATION +it&many hour&依赖 +one&hadoop functionality&依赖 +one&hadoop functionality&依赖 +one&fast restoration&依赖 +one&fast restoration&依赖 +one&fast 
restoration&依赖 +fast restoration&hadoop functionality&AGGREGATION +one&fast restoration&依赖 +one&hadoop functionality&依赖 +Hadoop NameNode&NameNode&GENERALIZATION +one&hadoop functionality&依赖 +Hadoop software&NameNode&依赖 +Hadoop software&memory space&依赖 +Hadoop software&memory space&依赖 +modified version&Hadoop software&AGGREGATION +memory space&NameNode&AGGREGATION +Hadoop software&NameNode&依赖 +Standard RAM&yet another limitation&依赖 +Standard RAM&Hadoop&依赖 +Standard RAM&RAM&GENERALIZATION +it&size&依赖 +One&file&依赖 +One&much datum&依赖 +Kove XPD&size&依赖 +removal&limitation&AGGREGATION +Kove XPD&contrast&依赖 +its&block& +sake&efficiency&AGGREGATION +it&failure&依赖 +single point&failure&AGGREGATION +prospect&persistent memory&依赖 +prospect&NameNode&依赖 +advantage&implementation&AGGREGATION +power failure&failure&GENERALIZATION +Persistent memory&power failure&依赖 +device&terabyte&依赖 +size&Kove memory&AGGREGATION +NameNode&more information&依赖 +NameNode&much memory&依赖 +number&file&AGGREGATION +diagram below&thought&依赖 +our&thoughts& +summary&approach&AGGREGATION +our&approaches& +we&major actor&依赖 +structure&FSImage structure INode&依赖 +structure&INode&依赖 +structure&CorruptReplicasMap and recentInvalidateSets and PendingBlockInfo and ExcessReplicateMap and PendingReplicationBlocks and UnderReplicatedBlocks&依赖 +structure&interest&AGGREGATION +new list&block&AGGREGATION +/ /&block&依赖 +/ /&new list&依赖 +/ /&new list&依赖 +/ /&block&依赖 +possible way&usage&依赖 +possible way&special buffer&依赖 +usage&special buffer&AGGREGATION +Registration&time (&依赖 +Registration&100 microsecond&依赖 +we&4 way&依赖 +we&it&依赖 +buffer&start&依赖 +cost&data transfer to/from other memory area&AGGREGATION +start&NameNode&AGGREGATION +buffer&NameNode&依赖 +different chunk&datum&AGGREGATION +buffer&time&依赖 +combination&a ) and ( b )&AGGREGATION +we&Kove&依赖 +we&it&依赖 +we&place and transfer&依赖 +area&interest&AGGREGATION +we&buffer&依赖 +May&additional code&依赖 +May&deal&依赖 +May&caching&依赖 +overhead&what&依赖 +Easiest to implement&library created buffer&依赖 +we&NameNode change&实现 +we&NameNode change&实现 +we&EHCache library&实现 +we&EHCache library&实现 +your&database& +We&it&依赖 +implementation&github here and https://github.com/markkerzner/nn_kove&依赖 +combination&teragen/terasort&AGGREGATION +testing&use nnbench&依赖 +testing&use nnbench&依赖 +result&run&AGGREGATION +performance&cluster&AGGREGATION +50 %&in-memory Hadoop code&AGGREGATION +KDSA block device&block device&GENERALIZATION +initial prototype&Kove XPD&依赖 +our&prototype& +initial prototype&KDSA block device&依赖 +block device&device&GENERALIZATION +performance&block device&AGGREGATION +proper way&direct write&依赖 +C interface&performance&依赖 +proper way&Java&依赖 +C interface&block device&依赖 +we&more meticulous implementation&依赖 +four group&test result&AGGREGATION +slots_millis_maps =&6462&依赖 +Launched map task&2&依赖 +slots_millis_reduces =&9238&依赖 +Bytes Read&= 50000000&依赖 +File&Counters& +file_bytes_read =&51000264&依赖 +hdfs_bytes_read =&50000218&依赖 +file_bytes_written =&102164352&依赖 +hdfs_bytes_written =&50000000&依赖 +spill records = 1000000&spill records = 1000000&依赖 +split_raw_bytes =&218&依赖 +Reduce input record&500000&依赖 +Reduce input group&500000&依赖 +Reduce output record&500000&依赖 +Number&file&AGGREGATION +# map&barrier&依赖 +#&exception&AGGREGATION +slots_millis_maps =&6541&依赖 +slots_millis_reduces =&9293&依赖 +file_bytes_written =&102156988&依赖 +slots_millis_maps =&6249&依赖 +slots_millis_reduces =&9218&依赖 +file_bytes_written =&102156990&依赖 +slots_millis_maps =&6390&依赖 +slots_millis_reduces =&9240&依赖 
+file_bytes_written =&102162937&依赖 +fast block copy&datum&AGGREGATION +Planned enhancement&fuller utilitzation&依赖 +its©& +Planned enhancement&capability&依赖 +terabyte&datum&AGGREGATION +fuller utilitzation&capability&AGGREGATION +fast block copy&terabyte&AGGREGATION +matter&second&AGGREGATION +capability&Kove XPD&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc6b4ee069bb658a04c893c7d0434b9596a7e483 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture-relation.txt @@ -0,0 +1,176 @@ +get start key design&hdfs architecture march 31 , 2021 hdf ( hadoop&AGGREGATION +get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 +get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 +get start key design&get start key design&依赖 +get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 +get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 +get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 +get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 +get start key design&get start key design&依赖 +get start key design&get start key design&依赖 +get start key design&get start key design&依赖 +get start key design&Global Edge Network Docs Resources Blog Case Studies Content Library Solution Briefs Changelog Engineering Education Partners About Section Pricing Contact Log&依赖 +get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 +get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 +get start key design&get start key design&依赖 +get start key design&appspace gps_fixedcore platform section control plane edge appspace adaptive edge engine ( aee )&依赖 +several feature&design&AGGREGATION +It&Hadoop framework&实现 +Hadoop framework&framework&GENERALIZATION +hdf ( hadoop&key difference&依赖 +hdf ( hadoop&key difference&依赖 +hdf ( hadoop&other distributed file system&依赖 +fault-tolerance&low-cost hardware&依赖 +hdf ( hadoop&other distributed file system&依赖 +big datum framework general design&hdf datum storage policy colocation&AGGREGATION +what traditional datum processing software application&what traditional datum processing software application&依赖 +its&benefits& +Big data framework&4Vs namely&依赖 +framework&processing&依赖 +framework&datum&依赖 +framework&datum&依赖 +framework&datum&依赖 +( massive amount&datum&AGGREGATION +framework&processing&依赖 +framework&processing&依赖 +processing&datum&AGGREGATION +file system ( hdf )&Hadoop framework&依赖 +hdf&Hadoop technical framework&依赖 +distributed file system&Hadoop technical framework&AGGREGATION +It&scenario&依赖 +website user datum behavior datum 
storage&website user datum behavior datum storage&依赖 +its&architecture& +General design&design feature&依赖 +design feature&efficient working&依赖 +design feature&architecture&AGGREGATION +its&working& +General design&HDFS architecture The hdf&AGGREGATION +design feature&following&依赖 +General design&architecture&依赖 +hdf&namespace and storage&依赖 +hdf&distinction&依赖 +hdf&feature&依赖 +hdf&data replication&依赖 +data replication&system&依赖 +availability&system&AGGREGATION +data replication&availability&依赖 +single block&datum&AGGREGATION +client&2 other node&依赖 +single block&3 node&依赖 +client&block&依赖 +failure&‘ DataNode ’&依赖 +HDFS framework&framework&GENERALIZATION +primary component&meta-data&依赖 +meta-data&file&AGGREGATION +primary component&file&依赖 +‘ NameNode ’&HDFS framework&依赖 +master node&node&GENERALIZATION +It&master node&依赖 +creation , deletion , and replication&data block&AGGREGATION +node&actual datum&依赖 +node&hdf&依赖 +its&space& +number&replica&AGGREGATION +hdf consist&NameNodes and DataNodes&依赖 +hdf consist&NameNodes and DataNodes&依赖 +hdf consist&NameNodes and DataNodes&AGGREGATION +one NameNode&client&依赖 +single cluster&one NameNode&依赖 +one NameNode&data access&依赖 +DataNode&instruction&依赖 +DataNode&NameNode&依赖 +hdf&coherent file system&依赖 +external process&system&依赖 +external process&one unified system&依赖 +file&blocks& +number&application&依赖 +NameNode&change&依赖 +namenode insert&new file&依赖 +namenode insert&new file&依赖 +creation&new file&AGGREGATION +namenode insert&record&依赖 +namenode insert&creation&依赖 +namenode insert&creation&依赖 +namenode insert&new file&依赖 +new file&hdf&依赖 +namenode insert&creation&依赖 +namenode insert&record&依赖 +namenode insert&record&依赖 +robustness&failure&依赖 +3 common type&failure&AGGREGATION +robustness&failure&依赖 +robustness&3 common type&依赖 +Its&robustness& +robustness&3 common type&依赖 +datum&size 64MB&依赖 +block&size 64MB&AGGREGATION +datum&hdf&依赖 +datum&block&依赖 +hdf&stored datum&依赖 +failure&component&AGGREGATION +completeness&stored datum&AGGREGATION +case&failure&AGGREGATION +hdf&completeness&依赖 +DataNode periodically report&’ message&依赖 +DataNode periodically report&NameNode&依赖 +NameNode&procedure&依赖 +data balance mechanism&datum&依赖 +even distribution&datum&AGGREGATION +data balance mechanism&even distribution&依赖 +Ensures data balance&data balance mechanism&依赖 +data balance mechanism&DataNodes&依赖 +snapshot mechanism&file system&AGGREGATION +Data storage policy&5 storage policy&依赖 +One_SSD – Storage&single replica&AGGREGATION +All_SSD – Storage&replica&AGGREGATION +HDFS NameNode&NameNode&GENERALIZATION +HDFS NameNode&datanode&依赖 +layered storage select&layered data storage&依赖 +layered storage select&proper storage device&依赖 +four type&storage device&AGGREGATION +disk ( mechanical hard disk and ram_disk ( memory virtualization hard disk&ssd ( solid-state disk&依赖 +disk ( mechanical hard disk and ram_disk ( memory virtualization hard disk&ssd ( solid-state disk&依赖 +tag storage select&directory tag&依赖 +tag storage select&directory tag&依赖 +tag storage select&proper DataNode&依赖 +tag storage select&proper DataNode&依赖 +directory tag&data importance level&依赖 +node group storage stores key datum&reliable node group&依赖 +node group storage stores key datum&node group storage stores key datum&依赖 +node group storage stores key datum&node group storage stores key datum&依赖 +node group storage stores key datum&reliable node group&依赖 +node group storage stores key datum&node group storage stores key datum&依赖 +node group storage stores key datum&reliable node group&依赖 +node 
group storage stores key datum&reliable node group&依赖 +node group storage stores key datum&node group storage stores key datum&依赖 +node group storage stores key datum&node group storage stores key datum&依赖 +node group storage stores key datum&node group storage stores key datum&依赖 +node group storage stores key datum&reliable node group&依赖 +DataNode cluster&heterogeneous server&依赖 +node group storage stores key datum&reliable node group&依赖 +Colocation&associated data or datum&依赖 +storage&associated data or datum&AGGREGATION +great consumption&network resource&AGGREGATION +massive migration&datum&AGGREGATION +datum&massive datum and system performance&依赖 +processing speed&massive datum and system performance&AGGREGATION +benefit&colocation Reduces network bandwidth&AGGREGATION +strength&hdf&AGGREGATION +its&fault-tolerance& +its&ability& +distinct difference&fault-tolerance&依赖 +its&throughput& +Relevant resources HDFS Architecture Guide characteristic&hdfs big data huawei peer review contributions by&AGGREGATION +author ruth mare ruth&Kenyatta University&依赖 +She&computer and cloud network&依赖 +She&research and collaboration&依赖 +article&engineering education program&依赖 +Section&Program& +student member&engineering education program&AGGREGATION +article&student member&依赖 +next generation&engineer&AGGREGATION +community-generated pool&resource&AGGREGATION +Section&pool& +Slack community&Careers Legals Resources Blog Case Studies Content Library Solution Briefs Partners Changelog Support Docs Community Slack Help & Support Platform Status Pricing Section&依赖 +our&community& +Slack community&Slack Company&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..54c71d366613fecf47341b3f418f9c7cfbc18921 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design-relation.txt @@ -0,0 +1,527 @@ +1 Introduction ....................................................................................................................... 3 2 Assumptions and Goals ..................................................................................................... 3 2.1 Hardware Failure&3 2.2 Streaming Data Access&依赖 +1 Introduction ....................................................................................................................... 3 2 Assumptions and Goals ..................................................................................................... 3 2.1 Hardware Failure&3 2.3 Large Data set&依赖 +Dhruba Borthakur Table&contents&AGGREGATION +persistence&file system metadata ......................................................................... 7 7&AGGREGATION +Heartbeats&re-replication ....................................................... 8 8.2 cluster rebalancing ......................................................................................................&依赖 +........................................................................................................... 8 copyright � 2005&Apache Software Foundation&依赖 +Blocks&Hadoop Distributed File System&依赖 +Blocks&.................................................................................................................. 
9.2 staging 9.3 pipelining 10 accessibility 10.1 dfsshell 10.2 dfsadmin 10.3 browser interface 11 space reclamation 11.1 file deletes&依赖 +8.4 metadata disk failure 8.5 snapshot 9 data organization 9.1 datum&
9 9.1 datum&依赖 +It&many similarity&依赖 +It&distributed file system&依赖 +application&large dataset&依赖 +hdf&few POSIX requirement&依赖 +hdf&infrastructure&依赖 +hdf&open source web crawler Apache Nutch project&依赖 +part&Lucene Apache Project&AGGREGATION +part&Hadoop Project&AGGREGATION +hdf&Hadoop Project&依赖 +Hardware Failure Hardware Failure&exception&依赖 +entire HDFS file system&server machine&依赖 +server machine&file system datum&依赖 +hundreds or thousand&server machine&AGGREGATION +server machine&piece&依赖 +entire HDFS file system&hundreds or thousand&依赖 +piece&file system datum&AGGREGATION +huge number&component&AGGREGATION +non-trivial probability&failure&AGGREGATION +component&failure&依赖 +component&non-trivial probability&依赖 +component&hdf&AGGREGATION +detection&hdf&依赖 +core architectural goal&hdf&AGGREGATION +detection&hdf&依赖 +detection&fault&AGGREGATION +their&data& +hdf&batch processing&依赖 +emphasis&latency&依赖 +latency&data access&AGGREGATION +emphasis&data access&依赖 +throughput&data access&AGGREGATION +emphasis&data access&依赖 +POSIX&many hard requirement&依赖 +Large Data Sets application&large data set&依赖 +hundred&node&AGGREGATION +It&ten&依赖 +It&file&依赖 +It&million&依赖 +It&single cluster&依赖 +million&file&AGGREGATION +ten&million&AGGREGATION +design page 3 copyright � 2005&Apache Software Foundation&依赖 +Most HDFS application&write-once-read-many access model&依赖 +Most HDFS application&file&依赖 +assumption&data coherency issue&实现 +Map-Reduce application&model&依赖 +Map-Reduce application&application&GENERALIZATION +size&data set&AGGREGATION +network congestion and increase overall throughput&system&AGGREGATION +Portability&portable&依赖 +Portability&such a way&依赖 +it&one platform&依赖 +platform&choice&AGGREGATION +widespread adoption&hdf&AGGREGATION +large set&application&AGGREGATION +namenode and datanode hdfs&master/slave architecture&依赖 +master/slave architecture&architecture&GENERALIZATION +HDFS cluster&master server&依赖 +master server&filesystem namespace&依赖 +HDFS cluster&cluster&GENERALIZATION +HDFS cluster&single Namenode&依赖 +number and one&addition&依赖 +number and one&addition&依赖 +number and one&addition&依赖 +number and one&addition&依赖 +number and one&Datanodes&AGGREGATION +cluster&storage&依赖 +hdf&a file system namespace&依赖 +set&Datanodes&AGGREGATION +file&one or more block&依赖 +block&set&依赖 +block&Datanodes&依赖 +etc.&files and directory&AGGREGATION +Namenode&filesystem namespace operation&依赖 +mapping&block&AGGREGATION +It&mapping&依赖 +It&Datanodes&依赖 +Datanodes&block creation&依赖 +block creation&creation&GENERALIZATION +Datanodes&instruction&依赖 +Datanodes&Namenode&依赖 +Namenode and Datanode&software&依赖 +piece&software&AGGREGATION +machine&Java&依赖 +machine&Namenode&依赖 +Usage&portable Java language&AGGREGATION +wide range&machine&AGGREGATION +dedicated machine&machine&GENERALIZATION +Namenode software&software&GENERALIZATION +typical deployment&dedicated machine&依赖 +dedicated machine&Namenode software&依赖 +one instance&Datanode software&AGGREGATION +Datanode software&software&GENERALIZATION +design page 4 copyright � 2005&Apache Software Foundation&依赖 +existence&single Namenode&AGGREGATION +existence&architecture&实现 +existence&system&实现 +architecture&system&AGGREGATION +Namenode&HDFS metada&依赖 +system&flows&依赖 +system&such a way&依赖 +user datum&Namenode&依赖 +File System Namespace hdf&traditional hierarchical file organization&依赖 +user&directory&依赖 +user&directory and store file&依赖 +file system namespace hierarchy&most other existing file system&依赖 +One&file&依赖 +hdf&user quota&实现 +hdf&hard link&依赖 +HDFS 
architecture&feature&实现 +HDFS architecture&architecture&GENERALIZATION +Namenode&file system namespace&依赖 +change&Namenode&依赖 +number&replica&AGGREGATION +application&number&依赖 +replica&file&AGGREGATION +number&file&AGGREGATION +application&file&依赖 +copy&file&AGGREGATION +number©&AGGREGATION +replication factor&file&AGGREGATION +information&Namenode&依赖 +It&file&依赖 +It&sequence&依赖 +sequence&block&AGGREGATION +It&block&依赖 +Blocks&fault tolerance&依赖 +block size and replication factor&file&依赖 +application&file&依赖 +application&replica&依赖 +replication&block&AGGREGATION +Namenode&replication&依赖 +Namenode&block&依赖 +Namenode&decision&依赖 +receipt&heartbeat&AGGREGATION +list&block&AGGREGATION +Blockreport&list&依赖 +Blockreport&Datanode&依赖 +Blockreport&block&依赖 +selection&placement&AGGREGATION +placement&replica&AGGREGATION +feature&most other distributed file system&依赖 +feature&hdf&依赖 +lot&tuning and experience&AGGREGATION +feature&lot&依赖 +feature&tuning and experience&依赖 +purpose&rack-aware replica placement&AGGREGATION +purpose&data reliability&依赖 +design page 5 copyright � 2005&Apache Software Foundation&依赖 +implementation&direction&依赖 +implementation&direction&依赖 +short-term goal&it&依赖 +its&behavior& +hdf&cluster&依赖 +hdf&computer&依赖 +cluster&computer&AGGREGATION +Datanode&rack&依赖 +Datanode&startup time&依赖 +Namenode&rack id&AGGREGATION +rack identity&machine&AGGREGATION +simple but non-optimal policy&replica&依赖 +entire rack&multiple rack&依赖 +entire rack&use&依赖 +use&bandwidth&AGGREGATION +component failure&failure&GENERALIZATION +policy&cluster&依赖 +it&load&依赖 +policy&replica&依赖 +it&component failure&依赖 +write&block&依赖 +policy&cost&依赖 +HDFS.s placement policy&one replica&依赖 +inter-rack write traffic&inter-rack write traffic&依赖 +policy cut&performance&依赖 +chance&rack failure&AGGREGATION +policy&impact datum reliability and availability guarantee&依赖 +it&aggregate network bandwidth&依赖 +datum&three&依赖 +datum&two unique rack&依赖 +replica&rack&依赖 +other one third&replica&AGGREGATION +two third&replica&AGGREGATION +One third&replica&AGGREGATION +other one third&rack&依赖 +policy&performance&依赖 +implementation&above policy&AGGREGATION +Replica Selection hdf&read request&依赖 +Replica Selection hdf&replica&依赖 +HDFS cluster&multiple data center&依赖 +replica&remote replica&依赖 +Namenode&special state&依赖 +Namenode&special state&依赖 +Replication&data block&AGGREGATION +design page 6 copyright � 2005&Apache Software Foundation&依赖 +Namenode&Heartbeat The Hadoop Distributed File System&依赖 +Blockreport&data block&依赖 +Blockreport&Namenode&依赖 +list&data block&AGGREGATION +a datanode report&Namenode&依赖 +a datanode report&Namenode&依赖 +Blockreport&a datanode report&依赖 +block&replica&依赖 +block&specified minimum number&依赖 +specified minimum number&replica&AGGREGATION +data block&block&GENERALIZATION +replica&data block&AGGREGATION +minimum number&replica&AGGREGATION +configurable percentage&safely-replicated data block&AGGREGATION +namenode exit&Safemode state&依赖 +namenode exit&Safemode state&依赖 +namenode exit&Safemode state&依赖 +namenode exit&Safemode state&依赖 +It&list&依赖 +It&data block&依赖 +It&)&依赖 +specified number&replica&AGGREGATION +Namenode&block&依赖 +Namenode&other datanode&依赖 +HDFS namespace&Namenode&依赖 +Persistence&File System Metadata&AGGREGATION +Namenode&transaction log&依赖 +Namenode&EditLog&依赖 +Namenode&system metada&依赖 +Namenode&file&依赖 +Namenode&local file system&依赖 +its&system& +entire file system namespace&FsImage&依赖 +entire file system namespace&file&依赖 +Namenode.s local file system&local file system&GENERALIZATION 
+FsImage&Namenode.s local file system&依赖 +Namenode&memory&依赖 +Namenode&entire file system namespace and file blockmap&依赖 +image&entire file system namespace and file blockmap&AGGREGATION +large number&files and directory&AGGREGATION +Namenode machine&machine&GENERALIZATION +in-memory representation&FsImage&AGGREGATION +it&FsImage and EditLog&依赖 +it&disk&依赖 +It&old EditLog&依赖 +transaction&persistent FsImage&依赖 +its&transactions& +checkpoint¤t implementation&依赖 +Work&periodic checkpointing&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +Datanode&knowledge&依赖 +Datanode&HDFS file&依赖 +It&HDFS datum&依赖 +It&block&依赖 +It&HDFS datum&依赖 +block&HDFS datum&AGGREGATION +It&block&依赖 +It&block&依赖 +It&HDFS datum&依赖 +Datanode&file&依赖 +Datanode&same directory&依赖 +optimal number&file&AGGREGATION +it&heuristic&依赖 +It&subdirectory&依赖 +It&local file&依赖 +local file system&single directory&依赖 +local file system&huge number&依赖 +huge number&file&AGGREGATION +It&same directory&依赖 +list&HDFS data block&AGGREGATION +it&local file system&依赖 +Communication Protocol All communication protocol&TCP/IP protocol&依赖 +top&TCP/IP protocol&AGGREGATION +Communication Protocol All communication protocol&top&依赖 +client&Namenode machine&依赖 +client&connection&依赖 +client&well-defined and configurable port&依赖 +It&Namenode&依赖 +It&ClientProtocol&依赖 +Datanodes&DatanodeProtocol&依赖 +Datanodes&Namenode&依赖 +( rpc ) abstraction&ClientProtocol&依赖 +Namenode&RPC&依赖 +Namenode&design&依赖 +It&RPC request&依赖 +robustness primary objective&hdf&AGGREGATION +robustness primary objective&datum&依赖 +presence&failure&AGGREGATION +three type&common failure&AGGREGATION +Data Disk Failure&Namenode&依赖 +Data Disk Failure&heartbeat message&依赖 +network partition&subset&依赖 +network partition&Datanodes&依赖 +subset&Datanodes&AGGREGATION +lack&heartbeat message&AGGREGATION +namenode mark&datanode&依赖 +namenode mark&datanode&依赖 +datum&hdf&依赖 +replication factor&block&AGGREGATION +their&value& +Namenode&block&依赖 +increase&replication factor&AGGREGATION +HDFS architecture&data rebalancing scheme&依赖 +free space&certain threshold&依赖 +free space&certain threshold&依赖 +sudden high demand&other datum&依赖 +sudden high demand&creation&依赖 +sudden high demand&cluster&依赖 +creation&additional replicas and rebalancing&AGGREGATION +additional replicas and rebalancing&other datum&AGGREGATION +sudden high 
demand&additional replicas and rebalancing&依赖 +sudden high demand&additional replicas and rebalancing&依赖 +sudden high demand&cluster&依赖 +sudden high demand&other datum&依赖 +sudden high demand&creation&依赖 +type&scheme&AGGREGATION +block&datum&AGGREGATION +design page 8 copyright � 2005&Apache Software Foundation&依赖 +HDFS client&HDFS file&实现 +contents&HDFS file&AGGREGATION +HDFS file&file&GENERALIZATION +HDFS client&checksum checking&实现 +HDFS client&contents&实现 +HDFS client&client&GENERALIZATION +it&checksum&依赖 +it&block&依赖 +it&block&依赖 +it&checksum&依赖 +it&checksum&依赖 +it&block&依赖 +it&block&依赖 +checksum&block&AGGREGATION +client&HDFS file&依赖 +it&checksum&依赖 +file contents&contents&GENERALIZATION +it&checksum&依赖 +client&file contents&依赖 +replica&block&AGGREGATION +Datanode&replica&依赖 +Datanode&block&依赖 +central data structure&hdf&AGGREGATION +Metadata Disk Failure The FsImage&hdf&依赖 +corruption&file&AGGREGATION +corruption&entire cluster&依赖 +multiple copy&FsImage and EditLog&AGGREGATION +update&updated synchronously&依赖 +synchronous update&rate&依赖 +rate&namespace transaction&AGGREGATION +synchronous update&second&依赖 +synchronous update&namespace transaction&依赖 +synchronous update&multiple EditLog&AGGREGATION +Namenode&latest consistent FsImage and EditLog to use&依赖 +Namenode machine&HDFS cluster&依赖 +Namenode machine&failure&依赖 +single point&failure&AGGREGATION +automatic restart and failover&Namenode software&AGGREGATION +particular instant&time&AGGREGATION +copy&datum&AGGREGATION +snapshot snapshot©&依赖 +snapshot snapshot&support&依赖 +snapshot snapshot&datum&依赖 +One usage&snapshot-feature&AGGREGATION +One usage&corrupted cluster&依赖 +HDFS current&snapshot&依赖 +they&datum one or more time&依赖 +application&datum&依赖 +hdf&write-once-read-many semantics&依赖 +hdf&file&依赖 +chunk&different datanode&依赖 +HDFS client&file datum&依赖 +HDFS client&temporary local file&依赖 +HDFS client&fact&依赖 +local file&HDFS block size&依赖 +client contact&Namenode&依赖 +client contact&Namenode&依赖 +local file&data worth&依赖 +client contact&Namenode&依赖 +namenode insert&file name&依赖 +namenode insert&file system hierarchy&依赖 +namenode insert&file name&依赖 +namenode insert&file system hierarchy&依赖 +identity&datanode (&AGGREGATION +Namenode&identity&依赖 +Namenode&datanode (&依赖 +Namenode&client request&依赖 +client&datum&依赖 +client&block&依赖 +client&datum&依赖 +client&datum&依赖 +client&block&依赖 +client&block&依赖 +un-flushed datum&Datanode&依赖 +client&Namenode&依赖 +Namenode&persistent store&依赖 +Namenode&point&依赖 +Namenode&file creation operation&依赖 +careful consideration&target application&AGGREGATION +above approach&target application&依赖 +above approach&careful consideration&依赖 +application&streaming write&依赖 +application&file&依赖 +network speed&writes&依赖 +client&client side buffering&依赖 +client&remote file&依赖 +network speed&network impact throughput&依赖 +e.g. AFS&client side caching&依赖 +e.g. 
AFS&earlier distribute file system&依赖 +higher performance&data upload&AGGREGATION +POSIX requirement&data upload&依赖 +POSIX requirement&higher performance&依赖 +client&datum&依赖 +client&HDFS file&依赖 +its&data& +datum&local file&依赖 +HDFS file&replication factor&依赖 +replication factor&three&AGGREGATION +HDFS file&three&依赖 +client&list&依赖 +local file&block&依赖 +local file&user datum&依赖 +list&Datanodes&AGGREGATION +block&user datum&AGGREGATION +client&Namenode&依赖 +Datanodes&replica&依赖 +list&Datanodes&依赖 +Datanodes&block&依赖 +client&first Datanode&依赖 +client&data block&依赖 +its&repository& +first Datanode&datum&依赖 +portion&data block&AGGREGATION +second Datanode&data block&依赖 +second Datanode&portion&依赖 +third Datanode&datum&依赖 +third Datanode&local repository&依赖 +it&next one&依赖 +Datanode&pipeline&依赖 +it&pipeline&依赖 +Datanode&datum&依赖 +it&same time&依赖 +Datanode&previous one&依赖 +datum&one Datanode&依赖 +datum&next&依赖 +Accessibility hdf&application&依赖 +Accessibility hdf&many different way&依赖 +design page 10 copyright � 2005&Apache Software Foundation&依赖 +DFSShell hdf&user datum&依赖 +form&files and directory&AGGREGATION +DFSShell&user interact&依赖 +syntax&command set&AGGREGATION +application&language&依赖 +directory&/ foodir&依赖 +command syntax&application&依赖 +browser interface a typical hdf&web-server&依赖 +hdf namespace and view contents&HDFS file&AGGREGATION +file&user&依赖 +it&hdf&依赖 +hdf&/ trash directory&依赖 +hdf&it&依赖 +hdf&file&依赖 +design page 11 copyright � 2005&Apache Software Foundation&依赖 +file&configurable amount&依赖 +configurable amount&time&AGGREGATION +file&/ trash&依赖 +file&time&依赖 +expiry&life&AGGREGATION +Namenode&/ trash&依赖 +Namenode&file&依赖 +Namenode&HDFS namespace&依赖 +Namenode&file&依赖 +its&life& +deletion&block&依赖 +deletion&file&AGGREGATION +time&corresponding increase&AGGREGATION +user&file&依赖 +it&/ trash directory&依赖 +user&file&依赖 +he/she&that&依赖 +he/she&/ trash directory&依赖 +/ trash directory&file&依赖 +/ trash directory&latest copy&依赖 +latest copy&file&AGGREGATION +hdf&directory&依赖 +/ trash directory&one special feature&依赖 +hdf&policy&依赖 +hdf&file&依赖 +current default policy&file&依赖 +policy&future&依赖 +policy&defined interface&依赖 +Namenode&excess replica&依赖 +next heartbeat transfer&information&依赖 +corresponding free space&cluster&依赖 +Datanode&corresponding block&依赖 +completion&setReplication apus&AGGREGATION +appearance&free space&AGGREGATION +hdf source code&Hadoop Distributed File System&依赖 +design page 12 copyright � 2005&Apache Software Foundation&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture-relation.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bc48d661c1f2e305c392a5574aa7ba8f56ceade --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture-relation.txt @@ -0,0 +1,635 @@ +Scalable HDFS Architecture Farag Azzedin Information&corporation&依赖 +Computer Science Department King Fahd University&petroleum and minerals dhahran&AGGREGATION +Scalable HDFS Architecture Farag Azzedin Information&cost&依赖 +One&area&AGGREGATION +Apache Hadoop&large scale data processing project&依赖 +Apache Hadoop&Hadoop&GENERALIZATION +one&data-intensive distributed application&依赖 +one&large scale data processing project&AGGREGATION +Hadoop application&distributed file system&依赖 +Hadoop application&data storage&依赖 +data storage&storage&GENERALIZATION +Hadoop application&file system ( hdf )&依赖 +master 
node&node&GENERALIZATION +its&RAM& +HDFS architecture&architecture&GENERALIZATION +metada&storage node&AGGREGATION +HDFS architecture&single master node&依赖 +HDFS architecture&single master node&依赖 +NameNode&Datanodes&依赖 +HDFS&NameNode& +HDFS Datanodes ’ metada&’s single-point-of-failure namenode&依赖 +capacity&RAM&AGGREGATION +HDFS Datanodes ’ metada&RAM&依赖 +HDFS Datanodes ’ metada&capacity&依赖 +RAM&’s single-point-of-failure namenode&AGGREGATION +paper&fault tolerant , highly available and widely scalable HDFS architecture&依赖 +drawback¤t HDFS architecture&AGGREGATION +this motivated researcher&this motivated researcher&依赖 +this motivated researcher&mapreduce&依赖 +this motivated researcher&system&依赖 +Google&MapReduce& +Apache Hadoop&cloud computing project&依赖 +reliable and scalable datum intensive distribute computing [ 2 , 3 , 4 ]&aiming&依赖 +one&cloud computing project&AGGREGATION +reliable and scalable datum intensive distribute computing [ 2 , 3 , 4 ]&Java&依赖 +its&applications& +HDFS implementation&even thousand&依赖 +HDFS implementation&server machine&依赖 +even thousand&server machine&AGGREGATION +system&data& +part&’s datum&AGGREGATION +HDFS implementation&implementation&GENERALIZATION +thus high probability&hardware failure&AGGREGATION +more server machine&more hardware&依赖 +component&hdf&AGGREGATION +faults detection&hdf [ 3 , 7 ]&依赖 +fundamental architectural goal&hdf [ 3 , 7 ]&AGGREGATION +same&HDFS cluster&依赖 +same&NameNode server&依赖 +HDFS cluster&cluster&GENERALIZATION +HDFS&performance& +availability&single NameNode machine&AGGREGATION +automatic restart and failover&NameNode software&AGGREGATION +Hadoop applications utilize HDFS [ 7 ]&Datanodes&依赖 +Hadoop applications utilize HDFS [ 7 ]&RAM [ 7 ]&依赖 +Hadoop applications utilize HDFS [ 7 ]&single master node&依赖 +Hadoop applications utilize HDFS [ 7 ]&NameNode&依赖 +its&]& +paper&NameNode&依赖 +case&single NameNode failure&AGGREGATION +HDFS NameNode&NameNode&GENERALIZATION +Several research project&Chord&依赖 +Several research project&basis&依赖 +Several research project&research&依赖 +their&research& +chord file system ( cfs ) store file&peer-to-peer system&依赖 +chord file system ( cfs ) store file&chord file system ( cfs ) store file&依赖 +chord file system ( cfs ) store file&peer-to-peer system&依赖 +chord file system ( cfs ) store file&peer-to-peer system&依赖 +chord file system ( cfs ) store file&peer-to-peer system&依赖 +chord file system ( cfs ) store file&chord file system ( cfs ) store file&依赖 +chord file system ( cfs ) store file&chord file system ( cfs ) store file&依赖 +chord file system ( cfs ) store file&chord file system ( cfs ) store file&依赖 +Chord&algorithms& +value&[ 9 ]&依赖 +set&special root server&AGGREGATION +ordinary dn&set&依赖 +Chord-based dn&special server&依赖 +ordinary dn&special root server&依赖 +route information ( n record&name server hierarchy&依赖 +correctness&analogous route information [ 9 ]&AGGREGATION +dn&route information ( n record&依赖 +Chord&analogous route information [ 9 ]&依赖 +dn&manual management&依赖 +route information ( n record&client&依赖 +Chord&correctness&依赖 +manual management&route information ( n record&AGGREGATION +Chord&no name structure [ 9 ]&依赖 +dn&named hosts or service&依赖 +dn&task&依赖 +rest&follow&依赖 +rest&paper&AGGREGATION +Hadoop architecture&Section II&依赖 +Hadoop architecture&architecture&GENERALIZATION +Section IV&problem statement&依赖 +we¤t Hadoop architecture&依赖 +we¤t Hadoop architecture&依赖 +we&issue&依赖 +we¤t Hadoop architecture&依赖 +we&issue&依赖 +its&NameNode& +our&motivation& +we&issue&依赖 +area&future 
work&AGGREGATION +We&Section vius&依赖 +HADOOP ARCHITECTURE Hadoop&several sub-project&依赖 +MapReduce&4 ]&依赖 +HADOOP ARCHITECTURE Hadoop&hadoop common hdfs&依赖 +MapReduce&4 ]&依赖 +this section briefly&sub-project&依赖 +this section briefly&Hadoop namely Hadoop Common&依赖 +sub-project&Hadoop namely Hadoop Common&AGGREGATION +Hadoop Common&Filesystem&依赖 +contribution area&3 ]&依赖 +contribution area&other Hadoop community project&依赖 +its&storage& +Kosmos Distributed File System )&[ 3 ]&依赖 +Kosmos Distributed File System )&[ 3 ]&依赖 +ten&petabyte&AGGREGATION +petabyte&storage&AGGREGATION +hdf&OS [ 3 ]&依赖 +hdf&top&依赖 +filesystem&OS [ 3 ]&AGGREGATION +hdf&filesystem&依赖 +top&filesystem&AGGREGATION +hdf&Java language [ 3 ]&依赖 +master/slave architecture&architecture&GENERALIZATION +hdf&master/slave architecture&依赖 +typical Hadoop cluster&NameNode&依赖 +NameNode&HDFS namespace&依赖 +Datanodes&actual datum&依赖 +machine&GNU/Linux OS&依赖 +machine&Java&依赖 +hdf&machine&依赖 +Usage&portable and all pervasive Java language&AGGREGATION +wide range&machine&AGGREGATION +dedicated machine&machine&GENERALIZATION +dedicated machine&NameNode software&依赖 +typical deployment&dedicated machine&依赖 +one instance&Datanode software&AGGREGATION +architecture&multiple datanode&依赖 +MapReduce&huge data set&依赖 +MapReduce&its simplicity and functionality [&依赖 +MapReduce&7 ]&实现 +huge data set&distributed application&AGGREGATION +its&simplicity& +MapReduce&distributed application&依赖 +integral part&Hadoop&AGGREGATION +It&large data set&依赖 +It&distributed computing&依赖 +cluster&computer&AGGREGATION +It&computer&依赖 +It&cluster&依赖 +MapReduce&datum&依赖 +master node&major input&依赖 +master node&typical ― Map ‖ function&依赖 +worker node&process&依赖 +worker node&node&GENERALIZATION +worker node&received problem chunk&依赖 +master node&" Reduce " function&依赖 +master node&processed sub-problem&依赖 +part&Hadoop project&AGGREGATION +it&map and reduction operation&依赖 +it&unnoticed distributed processing&依赖 +unnoticed distributed processing&map and reduction operation&AGGREGATION +multiple map function¶llel&依赖 +number&CPUs&AGGREGATION +output&same reducer&依赖 +map operation&operation&GENERALIZATION +map operation&same key&依赖 +set&' reducer&AGGREGATION +output&map operation&AGGREGATION +MapReduce&larger dataset&依赖 +MapReduce&handle&依赖 +petabyte&datum&AGGREGATION +parallelism&high availability&依赖 +parallelism&probability&依赖 +parallelism&probability&依赖 +parallelism&high availability&依赖 +case&partial failure&AGGREGATION +parallelism&probability&依赖 +probability&high availability&AGGREGATION +partial failure&servers or storage&AGGREGATION +parallelism&high availability&依赖 +parallelism&high availability&依赖 +parallelism&probability&依赖 +parallelism&high availability&依赖 +parallelism&probability&依赖 +rack name&worker node&AGGREGATION +rack name&network switch&AGGREGATION +rack name&[ 3 ]&依赖 +information&Hadoop application&依赖 +information&command&依赖 +HDFS filesystem&datum&依赖 +they&information&依赖 +HDFS filesystem&filesystem&GENERALIZATION +case&rack power or switch failure&AGGREGATION +hdf&reliable and extremely fast computations [ 5 ]&依赖 +hdf&numerous data blocks replica&依赖 +hdf&a cluster&依赖 +hdf&communication and client&依赖 +communication and client&RPC&依赖 +hdf&TCP/IP layer&依赖 +hdf&64 mb )&依赖 +ideal file size&64 mb )&依赖 +multiple&64 mb )&AGGREGATION +hdf&multiple&依赖 +hdf&large file&依赖 +datum&three node&依赖 +datum&default replication value&依赖 +replication&[&AGGREGATION +Data node&datum&依赖 +Figure 1&hdf&依赖 +client&single NameNode machine&依赖 +client&file metada or file modification&依赖 
+NameNode and Datanodes&built-in webservers [ 6 ]&依赖 +current status&cluster&AGGREGATION +their&]& +NameNode&HDFS metadata [ 7 ]&依赖 +system&flows&依赖 +system&such a way&依赖 +user datum&NameNode [ 7 ]&依赖 +hdfs architecture&work&依赖 +[ 7 ] NameNode&Datanode&依赖 +[ 7 ] NameNode&periodical heartbeat message&依赖 +case&network partition&AGGREGATION +subset&Datanodes&AGGREGATION +datanode&recent heartbeat&依赖 +Datanode death&block&依赖 +Datanode death&replication factor&依赖 +replication factor&block&AGGREGATION +their&value& +NameNode&which&依赖 +replication factor&file&AGGREGATION +hdf&HDFS namespace&依赖 +set&Datanodes&AGGREGATION +file&one or more block&依赖 +block&set&依赖 +block&Datanodes&依赖 +reference&block&AGGREGATION +NameNode&block&依赖 +NameNode&reference&依赖 +NameNode&reference&依赖 +NameNode&block&依赖 +NameNode&HDFS namespace operation&依赖 +NameNode&block&依赖 +mapping&block&AGGREGATION +NameNode&mapping&依赖 +NameNode&mapping&依赖 +NameNode&block&依赖 +system&clients& +Datanodes&block creation&依赖 +NameNode&HDFS namespace&依赖 +NameNode&Edit Log&依赖 +modification&place&依赖 +modification&file system metada&依赖 +transaction log&log&GENERALIZATION +NameNode&transaction log&依赖 +NameNode&file&依赖 +NameNode&local host OS file system&依赖 +its&system& +entire file system namespace&file&依赖 +NameNode&system& +Silage&’s local file system&依赖 +Silage&[ 7 ]&依赖 +Silage&file&依赖 +NameNode&memory& +image&entire file system namespace and file Block map&AGGREGATION +image&’s system memory ( ram )&依赖 +large number&files and directory&AGGREGATION +4GB&RAM&AGGREGATION +it&silage and edit log&依赖 +it&disk&依赖 +in-memory representation&Silage&AGGREGATION +It&old EditLog&依赖 +transaction&persistent FsImage&依赖 +its&transactions& +procedure&checkpoint&依赖 +NameNode&up [ 7 ]&依赖 +checkpoint¤t implementation&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&file&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&datanode store hdfs datum&依赖 +datanode store hdfs datum&local file system&依赖 +datanode store hdfs datum&file&依赖 +Datanode&knowledge&依赖 +Datanode&HDFS file&依赖 +It&HDFS datum&依赖 +It&block&依赖 +It&HDFS datum&依赖 +block&HDFS datum&AGGREGATION +It&block&依赖 +It&block&依赖 +It&HDFS datum&依赖 +Datanode&file&依赖 +Datanode&same directory&依赖 +optimal number&file&AGGREGATION +it&heuristic&依赖 +It&local file&依赖 +local file system&single directory&依赖 +local file system&huge number&依赖 +huge number&file&AGGREGATION +It&same directory&依赖 +list&HDFS data block&AGGREGATION +it&local file system&依赖 +NameNode&file&依赖 +NameNode&client request&依赖 +HDFS 
client&file datum&依赖 +HDFS client&temporary local file&依赖 +HDFS client&reality&依赖 +HDFS client&client&GENERALIZATION +Application write&temporary local file&依赖 +client contact&NameNode&依赖 +local file&datum worth&依赖 +local file&one HDFS block size&依赖 +client contact&NameNode&依赖 +client contact&NameNode&依赖 +namenode insert&file name&依赖 +namenode insert&file system hierarchy&依赖 +namenode insert&file name&依赖 +namenode insert&file system hierarchy&依赖 +NameNode&Datanode&依赖 +NameNode&identity&依赖 +identity&Datanode&AGGREGATION +NameNode&client request&依赖 +client&datum&依赖 +client&block&依赖 +client&datum&依赖 +block&datum&AGGREGATION +client&datum&依赖 +client&block&依赖 +client&block&依赖 +un-flushed datum&Datanode&依赖 +client&NameNode&依赖 +NameNode&persistent store&依赖 +NameNode&point&依赖 +NameNode&file creation operation&依赖 +its&store& +PROBLEM STATEMENT AND MOTIVATION&architecture&实现 +PROBLEM STATEMENT AND MOTIVATION&Hadoop [ 7 ]&实现 +PROBLEM STATEMENT AND MOTIVATION&Hadoop [ 7 ]&实现 +architecture&Hadoop [ 7 ]&AGGREGATION +PROBLEM STATEMENT AND MOTIVATION&Hadoop [ 7 ]&实现 +PROBLEM STATEMENT AND MOTIVATION&architecture&实现 +PROBLEM STATEMENT AND MOTIVATION&architecture&实现 +PROBLEM STATEMENT AND MOTIVATION&Hadoop [ 7 ]&实现 +usage&single NameNode machine&AGGREGATION +PROBLEM STATEMENT AND MOTIVATION&architecture&实现 +simplicity&cost , namely scalability and high availability issue&依赖 +maximum number&Datanodes&AGGREGATION +hdf&one distinctive manager/controller server machine&依赖 +server machine&machine&GENERALIZATION +hdf&NameNode&依赖 +it&outstanding client request&依赖 +Datanode&operation&依赖 +NameNode server restoration process&hour&依赖 +hdf&Secondary Namenode&依赖 +NameNode&information& +directory information&information&GENERALIZATION +Secondary NameNode function&directory information&依赖 +periodic image-based snapshot&directory information&AGGREGATION +Secondary NameNode function&periodic image-based snapshot&依赖 +edit log&an up-to-date directory structure [ 3 ]&依赖 +entire journal&HDFS action&AGGREGATION +above-mentioned issue&make&依赖 +above-mentioned issue&us&依赖 +above-mentioned issue&us&依赖 +above-mentioned issue&a way&依赖 +above-mentioned issue&a way&依赖 +above-mentioned issue&make&依赖 +paper&problem&依赖 +paper&solution&依赖 +it&node [ 9 ]&依赖 +it&key&依赖 +it&unique key&依赖 +v. 
chord protocol functional&chord protocol&AGGREGATION +Chord&[ 10 ]&依赖 +node&equal number&依赖 +consistent hash&load balancing&依赖 +node&key&依赖 +less reallocation&key&AGGREGATION +node&system [ 9 ]&依赖 +equal number&key&AGGREGATION +chord protocol address&fundamental issue&依赖 +node&a cluster include load balancing&依赖 +chord protocol address&fundamental issue&依赖 +It&load balancing&依赖 +hash function&function&GENERALIZATION +large cluster&node&AGGREGATION +chord lookup cost increase&log&依赖 +log&number&AGGREGATION +chord lookup cost increase&node&依赖 +chord lookup cost increase&log&依赖 +chord lookup cost increase&number&依赖 +chord lookup cost increase&node&依赖 +number&node&AGGREGATION +chord lookup cost increase&log&依赖 +chord lookup cost increase&number&依赖 +chord lookup cost increase&node&依赖 +chord lookup cost increase&node&依赖 +chord lookup cost increase&chord lookup cost increase&依赖 +chord lookup cost increase&number&依赖 +chord lookup cost increase&number&依赖 +chord lookup cost increase&log&依赖 +chord lookup cost increase&chord lookup cost increase&依赖 +chord lookup cost increase&chord lookup cost increase&依赖 +chord lookup cost increase&chord lookup cost increase&依赖 +additional parameter tuning&scaling&依赖 +its&tables& +Chord&high availability&依赖 +fault tolerant cluster&node&AGGREGATION +system&change [ 9 ]&依赖 +continuous state&change [ 9 ]&AGGREGATION +application&it&依赖 +fail safe nature&Chord software&AGGREGATION +form&library&AGGREGATION +fail safe nature&form&依赖 +fail safe nature&library&依赖 +system&Chord&依赖 +system&two fold&依赖 +system&two fold&依赖 +system&Chord&依赖 +function&node&依赖 +application&Chord library&依赖 +function&IP address&依赖 +application&function&依赖 +IP address&node&AGGREGATION +Chord library&library&GENERALIZATION +set&key&AGGREGATION +application&node&依赖 +application&Chord software&依赖 +new node&cluster [ 9 ]&依赖 +update&application software&依赖 +their&nodes& +update&respective value&依赖 +user-friendly naming&datum&AGGREGATION +requirements implementation&implementation&GENERALIZATION +flat key-space feature&Chord&AGGREGATION +flat key-space feature&requirements implementation&实现 +cryptographic hash&datum&AGGREGATION +application&datum&依赖 +application&Chord key&依赖 +Chord key&key&GENERALIZATION +application&data replication&依赖 +data&identifier& +several content provider&other ’s datum&依赖 +set&software development project&AGGREGATION +node&software development project&依赖 +everyone&periodic release&依赖 +node&load& +aggregate cost&cluster&AGGREGATION +implementation&Chord&依赖 +implementation&map data block&依赖 +Dabek et al. [ 12 ]&implementation&依赖 +Dabek et al. 
[ 12 ]&concept&实现 +implementation&concept&AGGREGATION +implementation&map data block&依赖 +implementation&Chord&依赖 +Chord&load balance&依赖 +Chord&application&依赖 +their&data& +they&return&依赖 +they&having&依赖 +node&’s datum&依赖 +node&data& +data&name& +Several similar problem&cooperative mirroring application&依赖 +goal&high availability&依赖 +Our&aim& +Our&architecture& +proposed architecture&NameNode Clustered chord ( nucu )&依赖 +client&single NameNode machine&依赖 +client&single NameNode machine&依赖 +client&file metadata or file modification&依赖 +client&file metadata or file modification&依赖 +client&single NameNode machine&依赖 +client&file metadata or file modification&依赖 +client&file metadata or file modification&依赖 +client&single NameNode machine&依赖 +We&client request&依赖 +We&resource request&依赖 +resource request&key&依赖 +resource request&consistent hashing algorithm&依赖 +NameNode&resource request reply ( rrp )&依赖 +NameNode&client resource request&依赖 +client resource request&resource request&GENERALIZATION +RRQ&NCUC black-box&依赖 +client&respective Datanodes&依赖 +client&respective Datanodes&依赖 +workflow&Figure 2&依赖 +NCUC&NCUC NameNodes&依赖 +NCUC&following way&依赖 +NCUC&key&依赖 +identifier&NCUC identifier NameNode ring modulo 2k&依赖 +whose&identifier& +identifier&z&AGGREGATION +NCUC&a k-bit identifier use sha-1 [ 16 ]&依赖 +NCUC&NameNode&依赖 +NCUC&consistent hash function&依赖 +its&key& +NameNode&identifier& +NameNode&address& +NameNode&successor NameNode&依赖 +successor NameNode&key z , or succ ( z )&AGGREGATION +successor NameNode&NameNode&GENERALIZATION +NCUC black-box&five key&依赖 +NameNode&key z , or succ ( z )&依赖 +NCUC black-box&ten NameNodes&依赖 +circle&number&AGGREGATION +Figure3&NCUC ring&依赖 +NCUC ring&ten NameNodes&依赖 +NCUC ring&ten NameNodes&依赖 +so key 10&NameNode 14&依赖 +successor&identifier 10 , succ ( 10 )&AGGREGATION +key 24&NameNode 32&依赖 +namenodes join&NCUC cluster&依赖 +namenodes join&NCUC cluster&依赖 +namenodes join&little interruption&依赖 +namenodes join&little interruption&依赖 +NCUC cluster&cluster&GENERALIZATION +n&successor& +NameNode n&NCUC clustered ring&依赖 +n&departure& +n&keys& +further change&keys allocation&依赖 +further change&NCUC namenode&依赖 +it&identifier 24&依赖 +it&key&依赖 +it&identifier 32&依赖 +it&key&依赖 +it&NameNode&依赖 +quick distributed calculation&hash function&AGGREGATION +NCUC&quick distributed calculation&依赖 +NCUC&hash function&依赖 +NCUC map&consistent hash [ 14 , 15 ]&依赖 +Nth NameNode&NCUC cluster&依赖 +NCUC hash function&hash function&GENERALIZATION +Nth NameNode&NameNode&GENERALIZATION +NCUC hash function&load balancing&依赖 +NameNodes approximately equal number&key&AGGREGATION +merely o ( 1/n ) portion&key&AGGREGATION +NameNode&table&依赖 +Workflow&ncuc architecture begin client resource requests – hash&AGGREGATION +NameNode&o ( logn&依赖 +lookup&) message&依赖 +lookup&o ( log n&依赖 +usage&Chord protocol [ 9 ]&AGGREGATION +one&primary goal&AGGREGATION +It&single HDFS NameNode architecture&实现 +simplicity&single HDFS NameNode architecture&AGGREGATION +It&simplicity&依赖 +primary goal&simple HDFS architecture&AGGREGATION +its&limitation& +single point-of-failure HDFS NameNode&alternative solution&依赖 +our&implementation& +we&performance analysis first&依赖 +we&Chord&依赖 +we&set&依赖 +we&experiment&依赖 +our&architecture& +set&experiment&AGGREGATION +we&2 Linux Amazon Cloud EC2 node&依赖 +We¤t HDFS architecture&依赖 +Table 1¤t HDFS architecture&依赖 +size 512 MB&512 MB&GENERALIZATION +Table 1&result&依赖 +single file&size 512 MB&AGGREGATION +Table 1&single file&依赖 +Table 2&size 512 MB&依赖 +Table 2&size 512 MB&依赖 +Table 
2&result&依赖 +Table 2&result&依赖 +Table 2&single file&依赖 +Table 2&single file&依赖 +512 MB&nrfile = 5 , replication = 1 op&依赖 +512 MB&nrfile = 5 , replication = 1 op&依赖 +I/O rate ( mb/s&0 0 0 0&依赖 +term&I/O rate&AGGREGATION +term&throughput&AGGREGATION +I/O rate ( mb/s&0 0 0 0&依赖 +IaaS , PaaS and saa&cost&依赖 +Hadoop application&data storage&依赖 +Hadoop application&primary distributed file system&依赖 +whose&NameNode& +proposed architecture&single-point-of-failure&依赖 +availability and scalability&HDFS architecture&AGGREGATION +its&single-point-of-failure& +proposed architecture&availability and scalability&依赖 +proposed architecture&HDFS architecture&依赖 +little complexity&approach&依赖 +little complexity&HDFS NameNode&依赖 +we&extensive experiment&依赖 +result&future extensive evaluation process&AGGREGATION +our&process& +acknowledgment author&support&依赖 +King Fahd University KFUPM&Petroleum and Minerals&AGGREGATION +project&number&GENERALIZATION +project&King Abdulaziz City kacst )&依赖 +project&Technology&依赖 +Berkeley View&Cloud Computing&AGGREGATION +technical report eecs-2009-28 and http://www.eecs.berkeley.edu/Pubs/TechRpts/2009/EECS-2009-28.html and Feb.&technical report eecs-2009-28 and http://www.eecs.berkeley.edu/Pubs/TechRpts/2009/EECS-2009-28.html and Feb.&依赖 +Theory&Computing&AGGREGATION +― Serving dn&Chord&依赖 +[ 12 ] F. Dabek&Proc&依赖 +― Analysis&evolution&AGGREGATION +evolution&peer-to-peer system&AGGREGATION +13 ] D. Liben-Nowell&Proc&依赖 +principle distribute computing ( podc ) and CA , July 2002 , pp.&distribute computing ( podc )&AGGREGATION +protocol&Proc&依赖 +protocol&Proc&依赖 +protocol&Proc&依赖 +Master&thesis& +Department&Electric&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..5329a9c3146391448a018b5f161bc023245fd2e1 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-relation.txt @@ -0,0 +1,920 @@ +mobile robot view project early detection&page&依赖 +Alzheimer&project& +mobile robot view project early detection&content&依赖 +Landmines detection&Mohammed Elmogy&依赖 +t�cnica vitivin�cola � july 2016 citation&READS&依赖 +a big data processing framework based&2,529 3 author&依赖 +mobile robot view project early detection&page&依赖 +Landmines detection&22 july 2016&依赖 +disease view project mohammed elmogy mansoura university 227 publication&1,801 citation&依赖 +a big data processing framework based&2,529 3 author&依赖 +mobile robot view project early detection&content&依赖 +author&publication&AGGREGATION +user&downloaded file&依赖 +user&enhancement&依赖 +enhancement&downloaded file&AGGREGATION +faculty and Egypt&Computers and Information&AGGREGATION +Faculty and Mansoura and Egypt&Computers and Information&AGGREGATION +mansoura university iot )&enormous storage challenge&依赖 +Internet&thing&AGGREGATION +Faculty&Computers and Information&AGGREGATION +IoT application&extensive development&依赖 +expansion&flow&依赖 +expansion&computational asset&AGGREGATION +flow&datum&AGGREGATION +expansion&datum&依赖 +expansion&significant effect&依赖 +vast flow&datum&AGGREGATION +vast flow&Big datum&依赖 +it&interesting information&依赖 +it&behavior and business intelligence&依赖 +user&behavior& 
+form&data resource&AGGREGATION +results and discussion&big datum iot-based smart application&依赖 +results and discussion&feasible solution&依赖 +we&clean noisy datum ( svd&依赖 +we&knn ) technique&依赖 +we&C-mean&依赖 +hybrid technique&C-mean&AGGREGATION +we&hybrid technique&依赖 +we&C-mean&依赖 +we&hybrid technique&依赖 +clustering technique&technique&GENERALIZATION +clustering technique&MapReduce model&实现 +MapReduce&most admit framework&依赖 +MapReduce&processing&依赖 +used technique&scalability&依赖 +it&huge dataset&依赖 +it&addition&依赖 +it&meaningful information&依赖 +accuracy&proposed framework&AGGREGATION +Internet iot )&thing&AGGREGATION +connection&variety&依赖 +connection&item&依赖 +connection&Internet&依赖 +connection&view information device&依赖 +variety&view information device&AGGREGATION +object&information&依赖 +object&information&依赖 +aim&perspective recognition&AGGREGATION +essential thought&thing&依赖 +essential thought&IoT&AGGREGATION +Figure 1&iot and big datum&依赖 +Figure 1&relationship&依赖 +sensor datum&big datum&依赖 +datum&IoT&依赖 +datum&most important part&依赖 +most important part&IoT&AGGREGATION +nature&IoT&AGGREGATION +billion&object&AGGREGATION +datum&IoT&依赖 +datum&sensor&依赖 +datum&nature&依赖 +various type&sensor&AGGREGATION +datum&various type&依赖 +It&discernment device&依赖 +datum&next challenges :&依赖 +datum&considered thing&依赖 +datum&next challenges :&依赖 +datum&considered thing&依赖 +huge number&discernment device&AGGREGATION +It&huge number&依赖 +massive scale&IoT&AGGREGATION +device&datum&依赖 +quick development&information scale&AGGREGATION +They&varied resources and heterogeneity&依赖 +They&IoT datum&依赖 +Different&observation gadget&AGGREGATION +varied resources and heterogeneity&IoT datum&AGGREGATION +gathered datum&different semantics and structure&依赖 +vast majority&IoT application&AGGREGATION +It&way&依赖 +IoT&communitarian&依赖 +IoT&data distribution&依赖 +2016 ) 4 interval&rescue vehicle&AGGREGATION +sort&assistant medical strategy&AGGREGATION +sort&what&依赖 +It&IoT&依赖 +It&principal issue&依赖 +application&IoT&AGGREGATION +It&application&依赖 +it&most part&依赖 +it&few sensor&依赖 +it&all while screen various pointer and dampness , light , and weight&依赖 +specimen information&line&依赖 +specimen information&information&GENERALIZATION +datum&volume&依赖 +they&mixed bag&依赖 +huge measure&dissimilar information&AGGREGATION +a thought or ideal model&gathering , and utilization&依赖 +Big datum&choice making&依赖 +Big datum&different universe&依赖 +Big datum&3 ]&依赖 +point&view&AGGREGATION +They&online networking&依赖 +They&sensor device&依赖 +big data 4v&volume , velocity , variety ,&依赖 +Figure 2&big data 4v&依赖 +It&big data sequence&依赖 +issues and technology&accessibility&依赖 +substantial volume&datum&AGGREGATION +accessibility&substantial volume&AGGREGATION +issues and technology&substantial volume&依赖 +organization&that&依赖 +issues and technology&datum&依赖 +rest&paper&AGGREGATION +Section 2&basic concept&依赖 +Section 3¤t related work&依赖 +Section 4&proposed system&依赖 +implementation result&proposed technique&AGGREGATION +implementation result&benchmark dataset&依赖 +implementation result&Section 5&依赖 +conclusion and future work&Section 6&依赖 +CONCEPTS&MapReduce&依赖 +one&perfect choice&AGGREGATION +perfect choice&programming paradigm&AGGREGATION +user&map function&依赖 +map function&pair&依赖 +map function&function&GENERALIZATION +map function&group&依赖 +pair&key-value&AGGREGATION +map function&intermediate key-value set&依赖 +group&intermediate key-value set&AGGREGATION +It&a reduce function&依赖 +MapReduce architecture&architecture&GENERALIZATION +MapReduce 
architecture&Figure 3&依赖 +MapReduce framework&framework&GENERALIZATION +reduce function&reduce function&实现 +MapReduce framework&large dataset&依赖 +mapper&mass&依赖 +mapper&datum&依赖 +mass&datum&AGGREGATION +reducer&intermediate result&依赖 +block diagram&MapReduce&AGGREGATION +block diagram&datum&依赖 +we&CSV extension&依赖 +we&data store&依赖 +we&dataset&依赖 +data store&datum&依赖 +data store&tabular datastore object&依赖 +we&' name&依赖 +we&dataset&依赖 +variables&names& +' name feature&working&依赖 +user&needs& +specified variable&need&AGGREGATION +' name feature&permit&依赖 +user&preview command&依赖 +Figure&generic map function&依赖 +function&coder&依赖 +We&intermediate key and intermediate value&依赖 +we&dataset&依赖 +we&specific value&依赖 +we&set&依赖 +we&key-value&依赖 +set&key-value&AGGREGATION +large dataset initialize datastore variable&large dataset&依赖 +Add&select variable&依赖 +Add&datastore&依赖 +block diagram&generic map function&AGGREGATION +c. map function datum key-value store subset term generic map function&intermediate key-value store add set&依赖 +intermediate key-value store add set&intermediate key&AGGREGATION +c. map function datum key-value store subset term generic map function&intermediate key&依赖 +condition&key-value pair Create output store&AGGREGATION +Map function&function&GENERALIZATION +partition&key-value condition&依赖 +partition&datum&AGGREGATION +Data Intermediate key-value store Map function&key-value pair Create output store&依赖 +Data Intermediate key-value store Map function&condition&依赖 +block diagram&map function&AGGREGATION +Figure 6&Map function&依赖 +Map function&table&依赖 +Map function&variable&依赖 +variables&property& +subset&selected key&依赖 +subset&condition value&依赖 +subset&dataset&AGGREGATION +condition value&value&GENERALIZATION +map function extract&subset&依赖 +condition value&selected key&AGGREGATION +map function extract&dataset&依赖 +Reduce&one key&依赖 +block diagram&of&AGGREGATION +block diagram&reduce function&依赖 +load&different point&AGGREGATION +we&one&依赖 +separation&Epsilon&AGGREGATION +piece&" cluster&AGGREGATION +one&them&AGGREGATION +we&them&依赖 +they&Create&依赖 +we&cluster&依赖 +more than minpoint point&Intermediate value Key-value store&AGGREGATION +greater part&new point&AGGREGATION +they&more than minpoint point&依赖 +Get all intermediate result&intermediate value&依赖 +output value&value&GENERALIZATION +Get all intermediate result&output value&依赖 +2016 ) 9 separation&epsilon&AGGREGATION +part&other group&AGGREGATION +its&Epsilon& +it&other group&依赖 +minpoint point&Epsilon&依赖 +minpoint point&Epsilon&依赖 +noise&point&GENERALIZATION +it&" noise point "&依赖 +whose&items& +each core-point c&edge&依赖 +each core-point c&edge&依赖 +each core-point c&c&依赖 +each core-point c&c&依赖 +each core-point c&edge&依赖 +each core-point c&edge&依赖 +each core-point c&c&依赖 +- neighborhood&c 3&AGGREGATION +each core-point c&edge&依赖 +each core-point c&c&依赖 +each core-point c&c&依赖 +item&graph&AGGREGATION +core point&point&GENERALIZATION +let x&1&依赖 +set&node&AGGREGATION +dataset&n cluster&依赖 +data point&cluster&依赖 +high level&relationship&AGGREGATION +cluster&relationship&依赖 +cluster&cluster&依赖 +cluster&high level&依赖 +data point&point&GENERALIZATION +data point&cluster&依赖 +data point&association&依赖 +data point&cluster&依赖 +low level&association&AGGREGATION +data point&low level&依赖 +center&cluster&AGGREGATION +technique&pattern recognition&依赖 +It&minimization&依赖 +It&objective function&依赖 +minimization&objective function&AGGREGATION +membership&xus&AGGREGATION +cj&following equation [ 6 ]&依赖 +cj¢er&依赖 +cj&cj = σn i = 1 ( mm 
ij&依赖 +| | * | |&measured datum&依赖 +ith&d-dimensional measured datum&AGGREGATION +degree&membership&AGGREGATION +cj&cluster&依赖 +d-dimension center&cluster&AGGREGATION +FCM sequentially&dataset&依赖 +FCM sequentially&right area&依赖 +FCM sequentially&cluster center&依赖 +FCM clustering strategy&fuzzy behavior&依赖 +they&method&依赖 +fuzzy behavior&issn :0254 -0223 vol&依赖 +7&clustering&依赖 +membership weight&a characteristic translation but not probabilistic&依赖 +membership weight&all&依赖 +outcome&item&依赖 +outcome&2.4 K&依赖 +estimation&item&AGGREGATION +outcome&KNN regression&依赖 +its&neighbors& +value&k-closest neighbor&依赖 +value&estimation&依赖 +estimation&k-closest neighbor&AGGREGATION +Euclidean distance&distance&GENERALIZATION +KNN&Euclidean distance&依赖 +KNN&labeled example&依赖 +KNN&Euclidean distance&依赖 +KNN&following equation [ 8 ]&依赖 +it&overall noise&依赖 +top K-number&adjacent neighbor&AGGREGATION +labeled example&highest distance&依赖 +It&detail&依赖 +n row&datum&依赖 +2.5 SINGULAR VALUE DECOMPOSITION SVD&datum&依赖 +rectangular matrix&datum&AGGREGATION +p column&experimental property&依赖 +2.5 SINGULAR VALUE DECOMPOSITION SVD&rectangular matrix&依赖 +same dimension&singular value&依赖 +VT&row&依赖 +SVD&outline&依赖 +SVD&coordinate system&依赖 +coordinate system&system&GENERALIZATION +SVD&original datum&依赖 +outline&original datum&AGGREGATION +eigenvector&a relate to&AGGREGATION +SVD&equation&依赖 +x&a relate to&依赖 +eigenvalue&A&AGGREGATION +eigenvalues and eigenvector&AAT or ATA&AGGREGATION +computation&SVD&AGGREGATION +singular value&AAT or ATA&依赖 +column&V&AGGREGATION +column&U.&AGGREGATION +eigenvector&column&依赖 +singular value&AAT or ATA&依赖 +eigenvector&V&依赖 +singular value&AAT or ATA&依赖 +eigenvector&U.&依赖 +singular value&eigenvalue&依赖 +square root&eigenvalue&AGGREGATION +singular value&AAT or ATA&依赖 +singular value&eigenvalue&依赖 +eigenvector&ATA&AGGREGATION +eigenvector&column&依赖 +eigenvector&AAT&AGGREGATION +diagonal entry&S matrix&AGGREGATION +S matrix&matrix&GENERALIZATION +singular value&S matrix&依赖 +SVD feature&matrix&依赖 +SVD feature&nearest rank-l estimation&依赖 +number&outstanding singular value&AGGREGATION +we&matrix estimation&依赖 +whose&rank& +whose rank&outstanding singular value&依赖 +whose rank&number&依赖 +7&important research topic&依赖 +Many researcher&field&依赖 +MapReduce technique&technique&GENERALIZATION +Tao&MapReduce technique&依赖 +light&K-means clustering calculation&AGGREGATION +They&light&依赖 +They&K-means clustering calculation&依赖 +They&monstrous little datum&依赖 +They&procedure&依赖 +outcome&information prepare proficiency&依赖 +Their&outcomes& +They&Kmeans calculation&依赖 +They&Kmeans calculation&依赖 +They&MapReduce&依赖 +They&view&依赖 +view&MapReduce&AGGREGATION +they&datum&依赖 +they&record&依赖 +they&converging&依赖 +converging&datum&AGGREGATION +they&cluster&依赖 +datum&high likeness&依赖 +datum&high likeness&依赖 +merger technique&technique&GENERALIZATION +merger technique&little information&AGGREGATION +exploration&little information&依赖 +exploration&them&依赖 +exploration&IoT&依赖 +exploration&merger technique&依赖 +number&cluster&AGGREGATION +Xu and Xun [ 11 ]&distributed computing&依赖 +MapReduce model&distributed computing&AGGREGATION +Xu and Xun [ 11 ]&MapReduce model&依赖 +they&MapReduce&依赖 +instrument&MapReduce&AGGREGATION +they&instrument&依赖 +key innovation&IoT&AGGREGATION +they&structural planning attribute&依赖 +They&IoT world&依赖 +IoT world&world&GENERALIZATION +They&information and datum&依赖 +They&conveyed mining&依赖 +they&stream information distribution&依赖 +deficiency&conventional Apriori calculation&AGGREGATION 
+Apriori&lower mining proficiency&依赖 +mining technique&stream information investigation , group and so on&依赖 +mining technique&technique&GENERALIZATION +mining technique&stream information investigation , group and so on&依赖 +mining technique&stream information investigation , group and so on&依赖 +They&system&依赖 +security&information&AGGREGATION +proposed system&low effectiveness&依赖 +its&usage& +Wang et al. [ 12 ]&structural planning&依赖 +Wang et al. [ 12 ]&agribusiness&依赖 +structural planning&IoT&AGGREGATION +Wang et al. [ 12 ]&IoT&依赖 +IoT&distributed processing&依赖 +structural planning&enormous sensor information&依赖 +sensor information&information&GENERALIZATION +structural planning&constant read or access&依赖 +XML document&standard&依赖 +organization&heterogeneous sensor datum&AGGREGATION +XML document&heterogeneous sensor datum&依赖 +XML document&standard&依赖 +XML document&organization&依赖 +lack&variety&AGGREGATION +variety&sensor datum&AGGREGATION +ClustBigFIM method&method&GENERALIZATION +Gole and Tidk [ 13 ]&ClustBigFIM method&依赖 +improvement&BigFIM algorithm&AGGREGATION +ClustBigFIM&BigFIM algorithm&依赖 +improvement&information&依赖 +improvement&velocity&依赖 +other data mining mission&good vision&依赖 +They&manner&依赖 +manner&association&AGGREGATION +They&association&依赖 +It&frequent item&依赖 +flow&information&AGGREGATION +It&Big datum&依赖 +Li et al. [ 1 ]&storage managing clarification&依赖 +storage managing clarification&managing clarification&GENERALIZATION +They&managing clarification&依赖 +IOTMDB&save&依赖 +Their&work& +they&addition&依赖 +they&massive IoT data ISSN :0254 -0223 Vol&依赖 +its&value& +diverse structure&sensor&依赖 +Mesiti and Valtolina [ 14 ]&structure&依赖 +diverse structure&sensor&依赖 +information accumulation&database&依赖 +they&answer&依赖 +world&Big information investigation strategy&依赖 +answer&information&依赖 +answer&heterogeneous sensor&依赖 +NoSQL framework&framework&GENERALIZATION +NoSQL framework&reasonable mapping&依赖 +They&easy to use loading framework&依赖 +zhan et al. [ 15 ]&massive data processing model&依赖 +zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 +zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 +zhan et al. [ 15 ]&massive data processing model&依赖 +zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 +zhan et al. [ 15 ]&massive data processing model&依赖 +zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 +zhan et al. [ 15 ]&massive data processing model&依赖 +zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 +zhan et al. [ 15 ]&massive data processing model&依赖 +zhan et al. [ 15 ]&zhan et al. [ 15 ]&依赖 +zhan et al. [ 15 ]&massive data processing model&依赖 +type&data resource&AGGREGATION +Their&model& +They&two main point&依赖 +they&cloudf&依赖 +they&Cloud Manager DB&实现 +variety&datum&AGGREGATION +galache et al. [ 16 ]&galache et al. [ 16 ]&依赖 +galache et al. [ 16 ]&galache et al. [ 16 ]&依赖 +nation&city resource&依赖 +Their&issue& +they&care&依赖 +they&resource&依赖 +they&resource&依赖 +they&resource&依赖 +they&resource&依赖 +they&care&依赖 +care&resource&AGGREGATION +they&care&依赖 +they&care&依赖 +set&smart IoT service&AGGREGATION +proposed framework&three-layer architecture&依赖 +asset&effective IoT benefit&依赖 +asset&Cloud&依赖 +Sowe et al.&answer&依赖 +Sowe et al.&massive heterogeneous sensor information issue&依赖 +They&join&依赖 +distinctive type&information&AGGREGATION +It&key middleware&依赖 +It&a service-controlled networking ( scn )&依赖 +It&sensor information&依赖 +It&client&依赖 +They&harvester ( udh ) advancement&依赖 +They&SCN&依赖 +portable detecting information&paper&依赖 +They&structure&依赖 +Cecchinel et al. 
[ 18 ]&programming structure&依赖 +programming structure&structure&GENERALIZATION +structure&dataset&依赖 +utilization measure&dataset&AGGREGATION +dataset&SMARTCAMPUS venture&依赖 +structural engineering&SMARTCAMPUS venture&依赖 +Their&engineering& +structural engineering&genuine prerequisite&依赖 +work&Big data accumulation stage&依赖 +work&way&依赖 +work&Big data accumulation stage&依赖 +way&Big data accumulation stage&AGGREGATION +work&way&依赖 +They&ISSN :0254 -0223 Vol&依赖 +its&applications& +programming model&client&依赖 +system programming&off chance&依赖 +they&new information&依赖 +Mishra et al. [ 19 ]&valuable data administration and knowledge detection&依赖 +Mishra et al. [ 19 ]&IoT Big datum&依赖 +Mishra et al. [ 19 ]&a cognitive-oriented iot big-da framework ( coib )&依赖 +They&huge scale mechanical computerization environment&依赖 +They&general IoT Big data layered design&依赖 +They&COIB system&依赖 +usage&COIB system&AGGREGATION +They&general IoT Big data layered design&依赖 +COIB system&system&GENERALIZATION +They&mining and examination huge information&依赖 +proposed system&store and retrieve iot big datum&依赖 +trillion&IoT item&AGGREGATION +proposed system&solution&依赖 +their&work& +our&system& +accuracy&datum&AGGREGATION +proposed system&datum&依赖 +proposed system&datum&依赖 +proposed system&massive number&依赖 +massive number&datum&AGGREGATION +proposed system&massive number&依赖 +we&datum&依赖 +we&noise&依赖 +we&big datum&依赖 +we&kennard sample and svd&依赖 +data reduction technique&big datum&依赖 +data reduction technique&IoT&依赖 +we&IoT&依赖 +we&mutual information algorithm&依赖 +we&datum clustering&依赖 +we&vast store&依赖 +we&MapReduce&依赖 +proposed system figure 8&proposed system figure 8&依赖 +proposed system&massive – heterogeneous sensor datum&AGGREGATION +proposed system&massive – heterogeneous sensor datum&依赖 +noiseless datum issn :0254 -0223 vol&noiseless datum issn :0254 -0223 vol&依赖 +Variety sensor raw datum datum cleaning data integration datum processing ( clustering&sensor raw datum datum cleaning data integration datum processing ( clustering&AGGREGATION +Variety sensor raw datum datum cleaning data integration datum processing ( clustering&little size clean&依赖 +Variety sensor raw datum datum cleaning data integration datum processing ( clustering&datum datum&依赖 +Variety sensor raw datum datum cleaning data integration datum processing ( clustering&datum datum&依赖 +Variety sensor raw datum datum cleaning data integration datum processing ( clustering&little size clean&依赖 +proposed system&data preprocessing and data processing phase&依赖 +proposed system&two main phase&依赖 +dataset&stage&依赖 +dataset&different sensor&依赖 +utilization&kennard sampling&AGGREGATION +dimensionality&datum&AGGREGATION +execution time&datum processing&AGGREGATION +last stage&correlation and mutual information&依赖 +last stage&aiming&依赖 +it&data distribution&依赖 +performance&big datum&AGGREGATION +main stage&detail&依赖 +main stage&subsection&依赖 +main stage&two phase&AGGREGATION +preprocessing&data science&依赖 +it&choice&依赖 +data mining method&raw datum&依赖 +data mining method&reasonable information&依赖 +It&association&依赖 +It&database-driven application&依赖 +step&detail&依赖 +step&subsection&依赖 +confusion&real information&依赖 +more than 30 %&real information&AGGREGATION +confusion&more than 30 %&依赖 +it&addition&依赖 +it&cost [ 21 ]&依赖 +It&datum&依赖 +rest&datum&AGGREGATION +it&size&依赖 +nominal attribute&datum&AGGREGATION +It&datum&依赖 +help&many technique&AGGREGATION +mean&numeric attribute or mode&AGGREGATION +It&KNN algorithm&依赖 +It&discrete and continuous 
attribute&依赖 +knn search&datum&依赖 +It&dataset&依赖 +It&most probable value&依赖 +We&data cleaning&依赖 +We&KNN algorithm&依赖 +block diagram&datum cleaning step&AGGREGATION +block diagram&datum cleaning step&依赖 +figure 9 show&noisy data and outlier&依赖 +figure 9 show&many challenge&依赖 +repetition&datum&AGGREGATION +value&KNN regression&依赖 +value&most probable value&依赖 +b ) data reduction a monstrous measure&different source&依赖 +b ) data reduction a monstrous measure&different source&依赖 +logistics insight&example&依赖 +logistics insight&r&d [ 23 ]&依赖 +logistics insight&r&d [ 23 ]&依赖 +b ) data reduction a monstrous measure&information&AGGREGATION +extraordinary difficulty term&computational manysided quality and characterization execution&AGGREGATION +Highdimensional information&computational manysided quality and characterization execution&依赖 +Highdimensional information&extraordinary difficulty term&依赖 +it&low-dimensional component space&依赖 +block diagram&data reduction step&依赖 +block diagram&data reduction step&依赖 +list&highest smallest distance&AGGREGATION +Kennard sample&time&依赖 +Kennard sample&number&依赖 +Kennard sample&iteration&依赖 +number&iteration&AGGREGATION +We&SVD&依赖 +dimensionality&large dimensional datum&AGGREGATION +SVD Input data De-duplication Detect outlier Replace&value&依赖 +SVD Input data De-duplication Detect outlier Replace&Input datum&依赖 +purpose&access&AGGREGATION +Big datum&huge volume&依赖 +Big datum&organization&依赖 +number&different source&AGGREGATION +It&diverse structure&依赖 +It&–&依赖 +this immense , various sort&information&AGGREGATION +organization&speedy , exact , and significant bit&依赖 +organization&knowledge [ 26 ]&依赖 +speedy , exact , and significant bit&knowledge [ 26 ]&AGGREGATION +Mutual information&relationship&依赖 +Mutual information&attribute&依赖 +( y ) ] ( 10 )&( x&依赖 +( y ) ] ( 10 )&[ 27 ]&依赖 +equation&mutual information&AGGREGATION +( y ) ] ( 10 )&y ) log2 [ p ( x&依赖 +two dimension&dataset&AGGREGATION +X and Y&dataset&依赖 +control&information processing&AGGREGATION +information processing&processing&GENERALIZATION +Data Processing Phase Data processing phase&information processing&依赖 +handling&information&AGGREGATION +Information preparation&handling&依赖 +Information preparation&information&依赖 +Massive datum&processing&依赖 +Massive datum&data store&依赖 +tremendous measure&comparative quality&AGGREGATION +We&MapReduce&依赖 +hybrid&FCM and DBSCAN&AGGREGATION +We&MapReduce&依赖 +We&MapReduce&依赖 +minimum point&minimum value&依赖 +minimum value&point&AGGREGATION +FCM-DBSCAN Map function&Map function&GENERALIZATION +minimum point&point&依赖 +we&FCM-DBSCAN Map function&依赖 +epsilon value¢er and point&依赖 +epsilon value&distance&依赖 +minimum point&cluster&依赖 +we&minimum point&依赖 +we&equation&依赖 +points and center&cluster&AGGREGATION +we¢er&依赖 +we&cluster&依赖 +epsilon value&value&GENERALIZATION +point and center&cluster equal&AGGREGATION +distance&greater&依赖 +distance&greater&依赖 +distance&epsilon value&依赖 +point&cluster&依赖 +point&neighborpt&依赖 +distance&epsilon value&依赖 +distance&greater&依赖 +distance&epsilon value&依赖 +point&cluster&依赖 +We&key&依赖 +It&reach&依赖 +It&convergence state&依赖 +7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&number&依赖 +7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&cluster&依赖 +7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&cluster&依赖 +7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&number&依赖 +7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&number&依赖 +7 and 2016 ) 17 fcm-dbscan map function fcm-dbscan ( d&cluster&依赖 +each point 
p&dataset D&依赖 +each point p&each point p&依赖 +each point p&dataset D&依赖 +each point p&each point p&依赖 +each point p&each point p&依赖 +each point p&dataset D&依赖 +input&FCM-DBSCAN Reduce function&依赖 +final cluster point&C cluster&依赖 +final cluster point&previous cluster point&依赖 +final cluster point&addition&依赖 +C cluster&cluster&GENERALIZATION +final cluster point¤t cluster point&依赖 +a point issn&:0254 -0223 Vol&依赖 +all point&cluster&依赖 +all point&cluster&依赖 +all point&cluster&依赖 +neighbor point&minimum point&依赖 +minimum point&point&GENERALIZATION +neighbor point&neighbor point&依赖 +output&datum&依赖 +output&cluster&依赖 +cluster&datum&AGGREGATION +set&cluster&AGGREGATION +raw datum&different sensor&依赖 +problem&sensor datum&依赖 +our propose work aim&problem&依赖 +Our&work& +raw datum&different sensors and store&依赖 +we&datum&依赖 +datum&noise&依赖 +datum&KNN&依赖 +We&KNN&依赖 +cleared datum&SVD algorithm&依赖 +significant vision&datum&AGGREGATION +it&time&依赖 +proposed model&data processing step&依赖 +clustering algorithm&entity&依赖 +clustering algorithm&space&依赖 +arrangement&entity&AGGREGATION +clustering algorithm&arrangement&依赖 +It&diverse forms and size&依赖 +It&cluster&依赖 +It&diverse forms and size&依赖 +It&cluster&依赖 +It&diverse forms and size&依赖 +huge quantity&datum&AGGREGATION +It&diverse forms and size&依赖 +cluster&diverse forms and size&AGGREGATION +It&cluster&依赖 +It&cluster&依赖 +RESULTS&29 ]&依赖 +RESULTS&ordinary IADL housekeeping activity&依赖 +general interval&dataset&AGGREGATION +usual spreading&activity&AGGREGATION +interval&activity&依赖 +interval&usual spreading&依赖 +interval&daily life&依赖 +Porcupine sensor&sensor&GENERALIZATION +They&ibracelet&依赖 +They&acceleration and RFID tag detection&依赖 +They&Porcupine sensor&依赖 +dataset&estimation&依赖 +estimation&1048576 record&AGGREGATION +dataset&1048576 record&依赖 +We&proposed technique and core ( tm )&实现 +We&2 due , 2 gh processor&实现 +part&used dataset&AGGREGATION +Figure 11&part&依赖 +Figure 11&used dataset&依赖 +act&activity label result&依赖 +act&iron&依赖 +Acc&acc3&依赖 +beginning&recording&AGGREGATION +Time&elapsed number&依赖 +Time&second&依赖 +elapsed number&second&AGGREGATION +Acc&real time clock [ ddmmyyhhmmss ]&依赖 +Time&elapsed number&依赖 +Time&second&依赖 +Time&second&依赖 +Time&elapsed number&依赖 +Figure 12&outlier detection&依赖 +value&state&依赖 +value&outlier&依赖 +value&state&依赖 +value&outlier&依赖 +value&field&AGGREGATION +observation&value&依赖 +expected scope&value&AGGREGATION +observation&experiment&依赖 +outlier&measurement or experimental error indication&依赖 +outlier&dataset&依赖 +outlier&value&依赖 +figure 13 show&value&依赖 +reduction&dataset&AGGREGATION +datum&property&依赖 +smaller number&property&AGGREGATION +datum&smaller number&依赖 +attribute&priority&依赖 +attribute&priority&依赖 +SVD1&present the datum&依赖 +SVD1&highest probability&依赖 +outcome matrix&matrix&GENERALIZATION +Figure 15&outcome matrix&依赖 +Figure 15&mutual information&依赖 +measure&two variable&依赖 +measure&two variable&依赖 +measure&variables mutual dependence&AGGREGATION +trans-information&two variable&AGGREGATION +mutual information&association or correlation&依赖 +rate&association or correlation&AGGREGATION +mutual information&row and column variable&依赖 +mutual information&rate&依赖 +mutual information&2N&依赖 +mutual information&datum&依赖 +it&high relationship&依赖 +value&mutual information&AGGREGATION +it&attribute&依赖 +MapReduce function execution&MapReduce implementation&依赖 +MapReduce function execution&result datum&依赖 +Figure 16&read datum&依赖 +read datum&resulted attributes view&依赖 +read datum&resulted attributes view&依赖 +read 
datum&set&依赖 +Figure 16&MapReduce&依赖 +Figure 16&MapReduce&依赖 +read datum&set&依赖 +Figure 16&read datum&依赖 +set&resulted attributes view&AGGREGATION +Figure 17&MapReduce implementation&依赖 +dataset&Map&依赖 +we&MapReduce implementation&依赖 +we&data result&依赖 +preprocessing&dataset&AGGREGATION +time and accuracy&dataset&AGGREGATION +5.4 EVALUATION The evaluation&dataset&依赖 +5.4 EVALUATION The evaluation&time and accuracy&依赖 +time and accuracy&preprocessing&AGGREGATION +value&specificity&AGGREGATION +we&Big datum&依赖 +we&accuracy&依赖 +accuracy&Big datum&AGGREGATION +negative tuple&FN False negative&依赖 +positive tuple&ISSN :0254 -0223 Vol&依赖 +negative tuple&TN True negative&依赖 +positive tuple&FP False Positives&依赖 +our&FCM-DBSCAN& +clustering algorithm&PCA&依赖 +clustering algorithm&different data reduction algorithm&依赖 +we&table 2&依赖 +we&dataset&依赖 +we&proposed approach&依赖 +we&training data and testing datum&依赖 +we&tested datum&依赖 +performance measure&proposed system&AGGREGATION +expended time comparison&different reduction algorithm&依赖 +expended time comparison&different reduction algorithm&依赖 +we&high accuracy value&依赖 +its&approaches& +our&studies& +FCM-DBSCAN&accuracy&依赖 +FCM-DBSCAN&highest value&依赖 +FCM-DBSCAN&accuracy&依赖 +FCM-DBSCAN&highest value&依赖 +highest value&accuracy&AGGREGATION +K-Means and optics&nearest accuracy value&依赖 +optics&longer time&依赖 +EM algorithm&other technique&依赖 +EM algorithm&larger time&依赖 +DBSCAN&high accuracy&依赖 +accuracy&FCM-DBSCAN&依赖 +vast increase&device&AGGREGATION +massive amount&IoT datum&AGGREGATION +Big datum&massive datum&依赖 +massive datum&much time&依赖 +We&processing massive and heterogeneous datum&依赖 +We&IoT&依赖 +We&framework&依赖 +paper&Big datum&依赖 +paper&many viewpoint&依赖 +raw dataset&different sensor&依赖 +Our&system& +proposed system&problem&依赖 +architecture&optics em dbscan fcm-dbscan pca pca kernel ica som svd issn :0254 -0223 vol&依赖 +architecture&proposed system&AGGREGATION +architecture&optics em dbscan fcm-dbscan pca pca kernel ica som svd issn :0254 -0223 vol&依赖 +we&preprocessing phase&依赖 +datum&most probable value&依赖 +we&KNN&依赖 +MapReduce model&datum clustering&依赖 +MapReduce model&datum clustering&依赖 +MapReduce model&Map and Reduce function&依赖 +MapReduce model&Map and Reduce function&依赖 +MapReduce model&datum clustering&依赖 +MapReduce model&datum clustering&依赖 +MapReduce model&Map and Reduce function&依赖 +MapReduce model&Map and Reduce function&依赖 +processing time&proposed system&AGGREGATION +we&processing&实现 +we&processing&实现 +we&different dataset&实现 +we&different dataset&实现 +future work&time&依赖 +we&data query processing&实现 +best and suitable model&NoSQL database&AGGREGATION +NoSQL database&database&GENERALIZATION +we&NoSQL database&实现 +we&best and suitable model&实现 +We&Key-value database&依赖 +Key-value database&database&GENERALIZATION +key-value ( kv ) store&associative array&依赖 +approach&selective key range&依赖 +we&challenge&依赖 +[&1 ] li , t. , liu , y. , tian , y. , shen , s. , & mao and w. ( 2012 ) w. ( 2012 )&依赖 +Improvement&Analyze Cluster&依赖 +Improvement&Large dataset&依赖 +Improvement&dbscan algorithm&AGGREGATION +Improvement&Large dataset&依赖 +Improvement&Large dataset&依赖 +Improvement&Large dataset&依赖 +Improvement&Analyze Cluster&依赖 +Improvement&Analyze Cluster&依赖 +Improvement&Analyze Cluster&依赖 +Comparative Analysis&k-mean&AGGREGATION +http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&c. ( 2014 )&依赖 +http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. 
( 2014 )&7 jan 2016&依赖 +http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&c. ( 2014 )&依赖 +http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&依赖 +http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&7 jan 2016&依赖 +http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&http://web.mit.edu/be.400/www/svd/singular_value_decomposition.htm [ 10 ] tao , x. & ji and c. ( 2014 )&依赖 +Management&Big Data&AGGREGATION +analysis&big datum&AGGREGATION +38th ieee annual international computers and Software&38th ieee annual international computers and Software&依赖 +management&massive IoT datum&AGGREGATION +collection&big datum&AGGREGATION +cognitive-oriented framework&iot big-data management prospective&依赖 +cognitive-oriented framework&iot big-data management prospective&依赖 +International Journal and ijact ) , 7 ( 5 ) and ijact ) , 7 ( 5 )&Advancements&AGGREGATION +new approachµarray data dimension reduction&AGGREGATION +data-pain-points [ 27 ] cover , t. & thomas and j. ( 2012 )&8 july 2015&依赖 +data-pain-points [ 27 ] cover , t. & thomas and j. ( 2012 )&http://data-informed.com/how-to-address-commonbig&依赖 +data-pain-points [ 27 ] cover , t. & thomas and j. ( 2012 )&8 july 2015&依赖 +data-pain-points [ 27 ] cover , t. & thomas and j. ( 2012 )&http://data-informed.com/how-to-address-commonbig&依赖 +element&information theory&AGGREGATION +Combination&RFID&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-simEnts.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-simEnts.txt new file mode 100644 index 0000000000000000000000000000000000000000..80c99c07f0aa3295882dd0ec4cedf029dc5953fc --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-simEnts.txt @@ -0,0 +1,74 @@ +Input,Input +Input,class TextInputFormat +Input,class SequenceFileInputFormat +Input,class CombineFileInputFormat +Input,class KeyValueTextInputFormat +Input,class FixedLengthInputFormat +Input,class NLineInputFormat +Input,class CombineFileRecordReader +Input,class KeyValueLineRecordReader +Input,class SequenceFileRecordReader +Input,class DBRecordReader +TextInputFormat,TextInputFormat +SequenceFileInputFormat,SequenceFileInputFormat +CombineFileInputFormat,CombineFileInputFormat +KeyValueTextInputFormat,KeyValueTextInputFormat +KeyValueTextInputFormat,KeyFilter +KeyValueTextInputFormat,Key questions +FixedLengthInputFormat,FixedLengthInputFormat +NLineInputFormat,NLineInputFormat +CombineFileRecordReader,CombineFileRecordReader +KeyValueLineRecordReader,KeyValueLineRecordReader +KeyValueLineRecordReader,Key questions +SequenceFileRecordReader,SequenceFileRecordReader +DBRecordReader,DBRecordReader +Map,Map +InverseMapper,InverseMapper +MultithreadedMapper,MultithreadedMapper +RegexMapper,RegexMapper +TokenCounterMapper,TokenCounterMapper +Partition,Partition +Partition,class KeyFieldBasedPartitioner +BinaryPartitioner,BinaryPartitioner +HashPartitioner,HashPartitioner +HashPartitioner,default Partitioner 
+KeyFieldBasedPartitioner,KeyFieldBasedPartitioner +KeyFieldBasedPartitioner,Key idea +RehashPartitioner,RehashPartitioner +TotalOrderPartitioner,TotalOrderPartitioner +Reduce,Reduce +Reduce,class IntSumReducer +Reduce,class LongSumReducer +Reduce,class FailJob +IntSumReducer,IntSumReducer +IntSumReducer,Reducer interfaces +IntSumReducer,ReducerFactory +IntSumReducer,Reducer aggregate +IntSumReducer,ReducerPhase +IntSumReducer,Reducer implementations +LongSumReducer,LongSumReducer +LongSumReducer,Reducer interfaces +LongSumReducer,ReducerFactory +LongSumReducer,Reducer aggregate +LongSumReducer,ReducerPhase +LongSumReducer,Reducer implementations +Output,Output +Output,class FileOutFormat +Output,class MapFileOutputFormat +Output,class SequenceFileOutputFormat +Output,class TextOutputFormat +Output,class MultipleOutputs +Output,class FileOutputCommitter +Output,class RecordWriter +MapFileOutputFormat,MapFileOutputFormat +MapFileOutputFormat,method Map +MapFileOutputFormat,Map Reduce papers +MapFileOutputFormat,MapTask +MapFileOutputFormat,FacebookMap +Map,class InverseMapper +Map,class MultithreadedMapper +Map,class RegexMapper +Map,class TokenCounterMapper +Map,class WrappedMapper +SequenceFileOutputFormat,SequenceFileOutputFormat +TextOutputFormat,TextOutputFormat diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt index 35baf04f2ad62a91e0807d6c16c81b2a986baf60..88c9d8c94333d163ebd0526cefd8092f6fc7e556 100644 --- a/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt @@ -1,4 +1,14 @@ Input , Input +Input , class Text Input Format +Input , class Sequence File Input Format +Input , class Combine File Input Format +Input , class Key Value Text Input Format +Input , class Fixed Length Input Format +Input , class N Line Input Format +Input , class Combine File Record Reader +Input , class Key Value Line Record Reader +Input , class Sequence File Record Reader +Input , class D B Record Reader Text Input Format , Text Input Format Sequence File Input Format , Sequence File Input Format Combine File Input Format , Combine File Input Format @@ -18,6 +28,7 @@ Multithreaded Mapper , Multithreaded Mapper Regex Mapper , Regex Mapper Token Counter Mapper , Token Counter Mapper Partition , Partition +Partition , class Key Field Based Partitioner Binary Partitioner , Binary Partitioner Hash Partitioner , Hash Partitioner Hash Partitioner , default Partitioner @@ -26,6 +37,9 @@ Key Field Based Partitioner , Key idea Rehash Partitioner , Rehash Partitioner Total Order Partitioner , Total Order Partitioner Reduce , Reduce +Reduce , class Int Sum Reducer +Reduce , class Long Sum Reducer +Reduce , class Fail Job Int Sum Reducer , Int Sum Reducer Int Sum Reducer , Reducer interfaces Int Sum Reducer , Reducer Factory @@ -39,10 +53,22 @@ Long Sum Reducer , Reducer aggregate Long Sum Reducer , Reducer Phase Long Sum Reducer , Reducer implementations Output , Output +Output , class File Out Format +Output , class Map File Output Format +Output , class Sequence File Output Format +Output , class Text Output 
Format +Output , class Multiple Outputs +Output , class File Output Committer +Output , class Record Writer Map File Output Format , Map File Output Format Map File Output Format , method Map Map File Output Format , Map Reduce papers Map File Output Format , Map Task Map File Output Format , Facebook Map +Map , class Inverse Mapper +Map , class Multithreaded Mapper +Map , class Regex Mapper +Map , class Token Counter Mapper +Map , class Wrapped Mapper Sequence File Output Format , Sequence File Output Format Text Output Format , Text Output Format \ No newline at end of file diff --git "a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce-relation.txt" "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce-relation.txt" new file mode 100644 index 0000000000000000000000000000000000000000..f54af0f02683226c9ba822d66779242a90cd04d3 --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce-relation.txt" @@ -0,0 +1,214 @@ +apache hadoop architecture – hdf&Explore&依赖 +architecture&Hadoop&AGGREGATION +we&Hadoop Architecture&依赖 +we&article&依赖 +article&Hadoop architecture&依赖 +Hadoop architecture&architecture&GENERALIZATION +component&Hadoop architecture&AGGREGATION +we&Hadoop architecture&依赖 +we&Hadoop architecture&依赖 +we&detail&依赖 +we&Hadoop Architecture diagram&依赖 +inexpensive , reliable , and scalable framework&big datum&依赖 +goal&inexpensive , reliable , and scalable framework&依赖 +large dataset&sizes and format&AGGREGATION +master-slave architecture&architecture&GENERALIZATION +Hadoop&datum&依赖 +vast amount&datum&AGGREGATION +Hadoop&master-slave architecture&依赖 +Hadoop&vast amount&依赖 +master node&task&依赖 +master node&slave node&依赖 +Slave node&actual business datum&依赖 +Hadoop architecture&three layer&依赖 +Management&component& +hdf and YARN&Hadoop Framework&依赖 +core component&Hadoop Framework&AGGREGATION +us&three core component&依赖 +It&Hadoop&依赖 +file&block-size chunk&依赖 +block&slave node&依赖 +block&cluster&依赖 +we&requirement&依赖 +block size&128 mb&依赖 +block size&128 mb&依赖 +we&which&依赖 +block size&default&依赖 +our&requirements& +block size&default&依赖 +HDFS&master-slave architecture&依赖 +HDFS&Hadoop&依赖 +It&NameNode and DataNode&依赖 +It&two daemon&依赖 +master node&node&GENERALIZATION +NameNode NameNode store&block&依赖 +NameNode NameNode store&names , information&依赖 +NameNode NameNode store&file&依赖 +NameNode NameNode store&block&依赖 +block&file&AGGREGATION +NameNode NameNode store&names , information&依赖 +NameNode NameNode store&file&依赖 +It&Datanodes&依赖 +slave node&actual business datum&依赖 +It&client read/write request&依赖 +namenode store&metada&依赖 +namenode store&metada&依赖 +datanodes store&file&依赖 +datanodes store&block&依赖 +namenode store&block location&依赖 +namenode store&block location&依赖 +It&Hadoop&依赖 +data processing layer&Hadoop&AGGREGATION +application&vast amount&依赖 +application&vast amount&依赖 +application&petabyte&依赖 +application&datum&依赖 +application&terabyte&依赖 +application&datum&依赖 +application&petabyte&依赖 +application&datum&依赖 +application&terabyte&依赖 +application&terabyte&依赖 +application&petabyte&依赖 +application&terabyte&依赖 +cluster&commodity hardware&AGGREGATION +application&datum&依赖 +application&petabyte&依赖 +application&vast amount&依赖 +application&vast amount&依赖 +MapReduce framework&framework&GENERALIZATION +MapReduce framework&< key , value > pair&依赖 +MapReduce job&work&依赖 +unit&work&AGGREGATION 
+MapReduce job&job&GENERALIZATION +MapReduce job&input datum&依赖 +MapReduce job&MapReduce program&依赖 +Hadoop&MapReduce job&依赖 +two type&task&AGGREGATION +Hadoop YARN&task&依赖 +Hadoop YARN&YARN&GENERALIZATION +they&unfavorable condition&依赖 +user&map function&依赖 +map function&function&GENERALIZATION +function&map task&AGGREGATION +output&map task&AGGREGATION +output&reduce task&依赖 +output&reduce task&依赖 +map task&task&GENERALIZATION +Reduce task&map task&依赖 +Reduce task&output&依赖 +Reduce task&aggregation&依赖 +MapReduce task&two phase&依赖 +MapReduce task&task&GENERALIZATION +Hadoop&input&依赖 +Hadoop&input&依赖 +Hadoop&fixed-size split&依赖 +RecordReader&record&依赖 +it&records itself&依赖 +RecordReader&split&依赖 +one map task&a user-defined function call map function&依赖 +Hadoop&map phase&依赖 +one map task&input split&依赖 +Hadoop&one map task&依赖 +input split&split&GENERALIZATION +one map task&record&依赖 +It&zero or multiple intermediate key-value pair&依赖 +It&map task output&依赖 +map task&output&依赖 +map task&local disk&依赖 +its&output& +Hadoop&combiner function&依赖 +combiner function&function&GENERALIZATION +Hadoop&user&依赖 +combiner group&map phase&依赖 +combiner group&datum&依赖 +combiner group&map phase&依赖 +combiner group&datum&依赖 +output&map function&AGGREGATION +It&map function&依赖 +It&output&依赖 +their&output& +map task partition&output&依赖 +their&values& +Hadoop&user&依赖 +Hadoop&partitioning&依赖 +Reducer task&a shuffle and sort step&依赖 +Reducer task&task&GENERALIZATION +main purpose&phase&AGGREGATION +main purpose&equivalent key&依赖 +sort and shuffle phase download&datum&依赖 +It&data piece&依赖 +It&large data list&依赖 +MapReduce framework&sort&依赖 +we&it&依赖 +sort and shuffling&framework&依赖 +developer&control&依赖 +developer&control&依赖 +Reducer&key grouping&依赖 +it&zero or more key-value pair&依赖 +it&OutputFormat&依赖 +Hadoop HDFS&HDFS&GENERALIZATION +reduce task output&Hadoop HDFS&依赖 +It&reducer output&依赖 +reducer output&output&GENERALIZATION +it&default&依赖 +it&key&依赖 +YARN YARN&YARN&GENERALIZATION +YARN YARN&Resource Negotiator&依赖 +resource management layer&Hadoop&AGGREGATION +It&Hadoop 2&依赖 +YARN&separate daemon&依赖 +YARN&functionality&依赖 +YARN&job scheduling&依赖 +YARN&idea&依赖 +job scheduling&scheduling&GENERALIZATION +functionality&job scheduling&AGGREGATION +basic idea&global ResourceManager and application Master&依赖 +application&job&依赖 +single job or DAG&job&AGGREGATION +basic idea&application&依赖 +YARN&ResourceManager and NodeManager&依赖 +apache hadoop yarn 1&apache hadoop yarn 1&依赖 +It&resource&依赖 +It&cluster&依赖 +It&application&依赖 +It&two main component&依赖 +Scheduler&resource&依赖 +Scheduler&running&依赖 +Scheduler&capacities , queues , etc&依赖 +Scheduler&resource&依赖 +It&application&依赖 +It&status&依赖 +Scheduler&restart&依赖 +Scheduler&failed task&依赖 +restart&failed task&AGGREGATION +resource requirement&application&AGGREGATION +It&scheduling&依赖 +ApplicationManager&first container&依赖 +their&status& +It&status and progress&依赖 +It&machine resource usage&依赖 +It&nodemanager (&依赖 +we&article&依赖 +we&Hadoop Architecture&依赖 +Hadoop&master-slave topology&依赖 +architecture&three layer&依赖 +hdf&Hadoop&依赖 +Hadoop cluster&cluster&GENERALIZATION +hdf daemon namenode and yarn daemon resourcemanager&Hadoop cluster&依赖 +hdf daemon namenode and yarn daemon resourcemanager&master node&依赖 +hdf daemon datanode&hdf daemon datanode&依赖 +hdf daemon datanode&hdf daemon datanode&依赖 +hdf daemon datanode&slave node&依赖 +hdf daemon datanode&slave node&依赖 +hdf daemon datanode&slave node&依赖 +hdf daemon datanode&slave node&依赖 +hdf daemon datanode&hdf daemon datanode&依赖 
+hdf daemon datanode&hdf daemon datanode&依赖 +hdf and mapreduce framework run&same set&依赖 +hdf and mapreduce framework run&same set&依赖 +hdf and mapreduce framework run&node&依赖 +same set&node&AGGREGATION +hdf and mapreduce framework run&same set&依赖 +hdf and mapreduce framework run&node&依赖 +hdf and mapreduce framework run&node&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c09e91e0f5051ce7614a158d476e6bcd3db513a --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions-relation.txt @@ -0,0 +1,646 @@ +PROFILE Sanchita Paul Birla Institute&Technology , Mesra 49 publication&AGGREGATION +https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 +https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 +https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 +content&Puneet Singh Duggal&依赖 +https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 +https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 +https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 +https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 +content&10 december 2016&依赖 +https://www.researchgate.net/publication/311558422 Big Data Analysis&Technology&依赖 +Diabetes Diagnosis View project Prediction&thunderstorm and lightning&AGGREGATION +https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 +https://www.researchgate.net/publication/311558422 Big Data Analysis&soft computing and datum mining view project puneet singh duggal birla institute&依赖 +author&publication&AGGREGATION +soft computing and datum mining view project puneet singh duggal birla institute&Technology&AGGREGATION +user&downloaded file&依赖 +user&enhancement&依赖 +enhancement&downloaded file&AGGREGATION +Technology , Mesra , Ranchi , India duggal@gmail.com Sanchita Paul Department&Computer Science & Engineering Birla Institute&AGGREGATION +We&on-demand , on-command Digital universe&依赖 +Computer Science & Engineering Birla Institute&Technology , Mesra , Ranchi , India duggal@gmail.com Sanchita Paul Department&AGGREGATION +Computer Science & Engineering Birla Institute&Technology Mesra , Ranchi , India sanchita07@gmail.com Abstract —&AGGREGATION +challenge and solutions puneet singh duggal department&Computer Science & Engineering Birla Institute&AGGREGATION +its&Volume& +datum&" Big Data "&依赖 +Most&datum&AGGREGATION +it&nature&依赖 +heterogeneity&datum&AGGREGATION +volume&Big Data&依赖 +Traditional data management , warehousing and analysis system&datum&依赖 +specific nature&Big Data&AGGREGATION +it&specific nature&依赖 +it&Big Data&依赖 +its&nature& +it&large distributed file system&依赖 +Map Reduce&efficient analysis&依赖 +efficient analysis&Big Data&AGGREGATION +Map Reduce&Big Data&依赖 +Traditional DBMS technique&Big Data&依赖 +Traditional DBMS technique&classification and clustering&依赖 
+classification and clustering&Big Data&AGGREGATION +author&various method&依赖 +author&catering&依赖 +author&various method&依赖 +author&catering&依赖 +use&file indexing&AGGREGATION +Minimization technique&use&依赖 +Minimization technique&file indexing&依赖 +Minimization technique&technique&GENERALIZATION +Map Reduce technique&paper&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&everything&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&click stream datum&依赖 +Keyword-Big Data Analysis&everything&依赖 +structured ( traditional dataset&DBMS table&依赖 +Big Data&e-mail attachment&依赖 +structured ( traditional dataset&DBMS table&依赖 +structured ( traditional dataset&rows and column&依赖 +Big Data&datum&依赖 +structured ( traditional dataset&rows and column&依赖 +Big Data&structured ( traditional dataset&依赖 +heterogeneous mix&datum&AGGREGATION +80 percent&enterprise datum&AGGREGATION +whose size&typical database software tool&依赖 +whose&size& +“ Big data ”&dataset&依赖 +ability&typical database software tool&AGGREGATION +big datum analyticsis&area&依赖 +advanced analytic technique&big data set&依赖 +two&most profound trend&依赖 +two&one&依赖 +one&most profound trend ( bus ) [ 4 ]&AGGREGATION +it&heterogeneity , velocity and volume&依赖 +it&Big Data&依赖 +it&1 ] [ 2 ]&依赖 +it&traditional data analysis and management tool&依赖 +heterogeneity , velocity and volume&Big Data&AGGREGATION +problem&NoSQL&依赖 +problem&NoSQL&依赖 +it&transaction processing&依赖 +analysis&Big Data&AGGREGATION +it&Parallel&依赖 +Map Reduce&[ 12 ]&依赖 +its&architecture& +shared-nothing&commodity diverse hardware ( big cluster )&依赖 +Map Reduce&characteristic&依赖 +function&high-level programming language&依赖 +Its&functions& +Hive tool&tool&GENERALIZATION +Query processing&NoSQL&依赖 +Hive tool&[ 20 ]&依赖 +what&possible solution&依赖 +more business opportunity&affinity&依赖 +more business opportunity&affinity&依赖 +best suppliers , associate product&affinity&AGGREGATION +more business opportunity&sale seasonality [ 25 ] etc&依赖 +more business opportunity&best suppliers , associate product&依赖 +more business opportunity&best suppliers , associate product&依赖 +more business opportunity&sale seasonality [ 25 ] etc&依赖 +advanced form&analytics [ 6 ]&AGGREGATION +Traditional experience online analytic processing ( olap )&analytics [ 6 ]&依赖 
+Traditional experience online analytic processing ( olap )&advanced form&依赖 +Organizations&specific form&实现 +Organizations&analytic&实现 +specific form&analytic&AGGREGATION +collection&related techniques and tool type&AGGREGATION +user&new business fact&依赖 +user&knew&依赖 +large volume&datum&AGGREGATION +analyst&detail&依赖 +analyst&large volume&依赖 +plenty&detail&AGGREGATION +analyst&plenty&依赖 +enterprise&analytics example&依赖 +enterprise&log datum&依赖 +analyst&datum&依赖 +subset&customer base&AGGREGATION +analyst&historic datum&依赖 +analyst&data warehouse&依赖 +company&new form&依赖 +other product&BI&AGGREGATION +company&customer behavioural change&依赖 +new form&customer behavioural change&AGGREGATION +discovery&metric , report , analytic model&依赖 +company&customer behavioural change&依赖 +company&new form&依赖 +different type&analytic tool&AGGREGATION +unique challenge&special processing system&依赖 +unique challenge&special processing system&依赖 +Map Reduce&[&依赖 +Map Reduce&[&依赖 +analysis&technique&依赖 +distributed file system architecture&original Google File System [ 13 ]&依赖 +Map Reduce job&efficient data processingtechnique&依赖 +Mapping , Combining , Shuffling , Indexing , Grouping and reduce&[ 7 ]&依赖 +phase&MapReduce&AGGREGATION +technique&Map Reduce task&实现 +technique&paper&依赖 +technique&implementation&依赖 +a result need&index&依赖 +a result need&index&依赖 +impact&world-wide Web&AGGREGATION +major issue&world-wide Web&依赖 +major issue&impact&依赖 +major issue&impact&依赖 +major issue&world-wide Web&依赖 +its&content& +Database technology&task&依赖 +company&topology&依赖 +company&information&依赖 +company&Web and users ‟ search history&依赖 +topology&Web and users ‟ search history&AGGREGATION +turn&further challenge&依赖 +turn&millennium&AGGREGATION +google ‟&challenge&实现 +challenge&web-scale datum management and analysis&AGGREGATION +google ‟&web-scale datum management and analysis&实现 +challenge&Web-scale storage&AGGREGATION +whose content&machine&依赖 +hundred&machine&AGGREGATION +whose&content& +whose content&machine&依赖 +it&large file&依赖 +whose content&machine&依赖 +programming model&model&GENERALIZATION +Google&Map Reduce programming model and platform [ 1 ] [ 13 ]&依赖 +its&model& +sort&partitioned parallelism&AGGREGATION +Map Reduce framework&datum&依赖 +Map Reduce framework&a common key (&依赖 +large collection&datum&AGGREGATION +Map Reduce framework&large collection&依赖 +group&instance&AGGREGATION +Facebook&suit&依赖 +its&system& +Hadoop system&traction&依赖 +it&use case include web indexing&依赖 +set&higher-level declarative language&AGGREGATION +Hadoop community&set&依赖 +Hadoop community&higher-level declarative language&依赖 +low-level nature&Map Reduce programming model&AGGREGATION +Popular language&Yahoo!&依赖 +Popular language&Pig&依赖 +Jaql&ibm [ 28 ]&依赖 +Jaql&[ 18 ]&依赖 +Pig&nature&依赖 +60 %&Yahoo!&AGGREGATION +90 %&Facebook Map Reduce use case&AGGREGATION +[ 27 ]&Dryad&依赖 +[ 27 ]&cover&依赖 +[ 27 ]&Dryad&依赖 +Microsoft&Hadoop [ 24 ]&依赖 +Microsoft&support&依赖 +its&strategy& +HADOOP AND HDFS Hadoop&data storage and processing&依赖 +it&hdf&依赖 +It&commodity hardware&依赖 +It&MapReduce&依赖 +It&distributed data processing&依赖 +[ 17 ] [ 19 ]&found&依赖 +software architecture&aHadoop stack&AGGREGATION +[ 17 ] [ 19 ]&layer&依赖 +file&byte&依赖 +( very large ) contiguous and randomly addressable sequence&byte&AGGREGATION +file&distributed file system&依赖 +hdf&Hadoop software stack&依赖 +hdf&bottom&依赖 +hdf&bottom&依赖 +hdf&Hadoop software stack&依赖 +file&( very large ) contiguous and randomly addressable sequence&依赖 +bottom&Hadoop software stack&AGGREGATION +HDFS 
file&file&GENERALIZATION +middle layer&batch analytic&依赖 +middle layer&batch analytic&依赖 +Hadoop Map Reduce system&HDFS file&依赖 +middle layer&stack&AGGREGATION +map phase&job&AGGREGATION +Hadoop Map Reduce system&map operation&依赖 +group&output data item&AGGREGATION +Hadoop Map Reduce system&map operation&依赖 +partition&HDFS file and sort&AGGREGATION +Hadoop Map Reduce system&partition&依赖 +hbase store (&basic key-based record management operation&依赖 +hbase store (&Hadoop stack&依赖 +hbase store (&key-value layer&依赖 +hbase store (&application&依赖 +hbase store (&application&依赖 +hbase store (&key-value layer&依赖 +hbase store (&Hadoop stack&依赖 +hbase store (&basic key-based record management operation&依赖 +top&hdf )&AGGREGATION +contents&HBase&AGGREGATION +Many user&declarative language&依赖 +MapReduce programming model&programming model&GENERALIZATION +Many user&bare MapReduce programming model&依赖 +Many user&Hadoop stack&AGGREGATION +Many user&use&依赖 +use&declarative language&AGGREGATION +High-level language compiler null&Hadoop software stack&依赖 +High-level language compiler null&Hadoop software stack&依赖 +High-level language compiler null&such client&依赖 +High-level language compiler null&such client&依赖 +HDFS Clusters Figure2&traditional experience&依赖 +collection&related technique&AGGREGATION +HDFS Clusters Figure2&relevancy&依赖 +HDFS Clusters Figure2&traditional experience&依赖 +HDFS Clusters Figure2&relevancy&依赖 +Figure 3&Hadoop&实现 +Figure 3&architecture&依赖 +architecture&HDFS clusters implementation&AGGREGATION +Figure 3&HDFS clusters implementation&依赖 +hdf&task&依赖 +Data analysis task&cluster&依赖 +BIG DATA ANALYSIS Heterogeneity&progress&依赖 +BIG DATA ANALYSIS Heterogeneity&progress&依赖 +BIG DATA ANALYSIS Heterogeneity&process&依赖 +phase&datum&依赖 +BIG DATA ANALYSIS Heterogeneity&process&依赖 +BIG DATA ANALYSIS Heterogeneity&phase&依赖 +phase&value&依赖 +BIG DATA ANALYSIS Heterogeneity&phase&依赖 +phase&process&AGGREGATION +much datum today&structured format&依赖 +images and video&storage and display&依赖 +piece&text&AGGREGATION +major creator&value&AGGREGATION +value&datum&AGGREGATION +most datum&digital format today&依赖 +we&opportunity&依赖 +scalability&algorithm&AGGREGATION +Big Data analysis&many application&依赖 +complexity&datum&AGGREGATION +lack&algorithm&AGGREGATION +lack&scalability&AGGREGATION +most&statistician&依赖 +its&interpretation& +presentation&result&AGGREGATION +most&BI related job&AGGREGATION +Figure 4&big data analysis tool&依赖 +glimpse&big data analysis tool&AGGREGATION +data storage part&HDFS distributed file system architecture&依赖 +other mention architecture&amazon web service ( aws ) [ 23 ] , hbase and cloudstore etc&依赖 +HDFS distributed file system architecture&distributed file system architecture&GENERALIZATION +part&hadoop and hdfs framework&AGGREGATION +velocity and heterogeneity&datum&AGGREGATION +volume and veracity&datum&AGGREGATION +layer&bedrock&依赖 +layer&Big Data management and analysis framework&依赖 +layer&Big Data management and analysis framework&依赖 +layer&bedrock&依赖 +their&tools& +MapReduce programming model&map ( )&依赖 +MapReduce programming model&two function and map ( )&依赖 +their&logic& +user&own processing logic&实现 +list&intermediate key/value pair&AGGREGATION +map ( ) function&input key/value pair&依赖 +mapreduce runtime system group&mapreduce runtime system group&依赖 +mapreduce runtime system group&mapreduce runtime system group&依赖 +signature&map ( )&AGGREGATION +one master node&slave node&依赖 +list ( v2 )&master-slave architecture&依赖 +list ( v2 )&master-slave architecture&依赖 
+number&slave node&AGGREGATION +one master node&number&依赖 +one master node&19 ]&依赖 +Hadoop&MapReduce job&依赖 +MapReduce job&job&GENERALIZATION +data block&one TaskTracker node&依赖 +TaskTracker node&JobTracker&依赖 +scheduler&new task&依赖 +scheduler&it&依赖 +it&data block&依赖 +scheduler&data locality&依赖 +scheduler&account&依赖 +Map Reduce Architecture&local data block&依赖 +Map Reduce Architecture&TaskTracker&依赖 +scheduler&TaskTracker&依赖 +scheduler&rack-local or random data block&依赖 +runtime system group&reduce task&依赖 +set&reduce task&AGGREGATION +runtime system group&set&依赖 +hundreds or thousand&processor&AGGREGATION +scalability and i&heterogeneous and large dataset&依赖 +scalability and i&inbuilt process&依赖 +inbuilt process&heterogeneous and large dataset&AGGREGATION +scalability and i&status and monitoring&依赖 +status and monitoring&heterogeneous and large dataset&AGGREGATION +scalability and i&status and monitoring&依赖 +scalability and i&heterogeneous and large dataset&依赖 +scalability and i&inbuilt process&依赖 +Node –&file&依赖 +Node –&HDFS metada&依赖 +Node –&doesn ‟ t deal&依赖 +Data Node – stores block&HDFS – default replication level&AGGREGATION +job tracker – schedule&job tracker – schedule&依赖 +Task Tracker –&Mapper and Reducer interface&实现 +core&job&AGGREGATION +1 ) mapper mapper&input key/value pair&依赖 +1 ) mapper mapper&intermediate key/value pair&依赖 +1 ) mapper mapper&set&依赖 +set&intermediate key/value pair&AGGREGATION +individual task&input record&依赖 +individual task&intermediate record&依赖 +zero or many output pair&19 ]&依赖 +block&input file&AGGREGATION +number&map&AGGREGATION +total number&block&AGGREGATION +number&input&依赖 +number&total size&依赖 +total size&input&AGGREGATION +right level¶llelism&AGGREGATION +map&execute&依赖 +map&minute&依赖 +10TB&input datum&AGGREGATION +you&input datum&依赖 +blocksize&128MB&AGGREGATION +you&10TB&依赖 +with 82,000 map&17 ] [ 19 ]&依赖 +smaller set&value&AGGREGATION +2 ) reducer reducer&intermediate value&依赖 +intermediate value&key&依赖 +intermediate value&value&依赖 +intermediate value&smaller set&依赖 +set&intermediate value&AGGREGATION +2 ) reducer reducer&set&依赖 +Reducer&3 primary phase&依赖 +Reducer&shuffle&依赖 +2.1 ) shuffle input&mapper&依赖 +2.1 ) shuffle input&mapper&依赖 +sorted output&mapper&AGGREGATION +framework&HTTP&依赖 +framework&relevant partition&依赖 +framework&HTTP&依赖 +output&mapper&AGGREGATION +framework&output&依赖 +relevant partition&output&AGGREGATION +framework&relevant partition&依赖 +framework&output&依赖 +framework group&key&依赖 +framework group&have&依赖 +framework group&reducer input&依赖 +one&a comparator ( secondary sort )&依赖 +( list&value&AGGREGATION +grouped inputs.The output&reduce task&AGGREGATION +application&Reporter&依赖 +output&Reducer&AGGREGATION +right number&reduce&AGGREGATION +better job&load balancing [ MR Framework ]&AGGREGATION +their&round& +faster node&reduce&依赖 +faster node&reduce&依赖 +faster node&first round&依赖 +faster node&first round&依赖 +first round&reduce&AGGREGATION +number&reduce&AGGREGATION +cost&failure&AGGREGATION +scaling factor&a few reduce slot&依赖 +scaling factor&speculative-task&依赖 +It&number&依赖 +number&reduce-task&AGGREGATION +It&reduce-task&依赖 +a ) partitioner partitioner partition&key space&依赖 +Partitioner&key&依赖 +key&intermediate map-output&AGGREGATION +Partitioner&intermediate map-output&依赖 +partitioning&key&AGGREGATION +Partitioner&partitioning&依赖 +subset&key )&AGGREGATION +number&job&依赖 +number&reduce task&依赖 +number&of&AGGREGATION +total number&partition&AGGREGATION +this control&task&依赖 +intermediate key (&for reduction&依赖 +b ) 
reporter reporter&MapReduce application&依赖 +counters.mapper and reducer implementation&Reporter&依赖 +counters.mapper and reducer implementation&progress&依赖 +application&time&依赖 +significant amount&time&AGGREGATION +application&significant amount&依赖 +framework&task&依赖 +application&counter&依赖 +application&Reporter&依赖 +c ) output collector output collector&facility&依赖 +MapReduce framework&framework&GENERALIZATION +name node –&HDFS metada&依赖 +generalization&facility&AGGREGATION +RGPV 274 output&job )&AGGREGATION +library&useful mapper&AGGREGATION +amount&intermediate datum&AGGREGATION +They&" mini-reducer&依赖 +" mini-reducer&mapper&依赖 +" mini-reducer&output&依赖 +combiner&term& +result&collection&依赖 +result&term&依赖 +result&order&依赖 +result&order&依赖 +result&total number&依赖 +result&collection&依赖 +result&network&依赖 +result&collection&依赖 +number&intermediate key-value pair&AGGREGATION +result&total number&依赖 +result&network&依赖 +result&term&依赖 +result&total number&依赖 +result&term&依赖 +result&collection&依赖 +total number&term&AGGREGATION +result&term&依赖 +result&network&依赖 +order&total number&AGGREGATION +result&order&依赖 +order&number&AGGREGATION +number&unique term&AGGREGATION +result&network&依赖 +result&order&依赖 +result&total number&依赖 +They&result size&依赖 +machine&shuffling cost&依赖 +result size&map function&AGGREGATION +They&map function&依赖 +keyword&technique&依赖 +they&document key&依赖 +keyword&which&依赖 +keyword&document&AGGREGATION +keyword&document key&依赖 +they&which&依赖 +> doc4 :24 shuffling shuffling&IMF , Financial Economics Crisis Doc2&依赖 +index&file&AGGREGATION +their&keys& +> doc4 :24 shuffling shuffling&example Doc1&依赖 +harry potter crisis follow&above data IMF&依赖 +inverted index&above data IMF&AGGREGATION +heterogeneous mix&dataset&AGGREGATION +better chance&accurate result&依赖 +We&population&依赖 +We&generating&依赖 +We&shuffling process&依赖 +process&nature&依赖 +their&purpose& +Cartesian product&datum&AGGREGATION +datum&possible combination&AGGREGATION +its&techniques& +Map Reduce&own Join technique&依赖 +it&Map Reduce&依赖 +it&means&依赖 +iterative work&partitioning&依赖 +iterative work&datum&依赖 +iterative work&datum&依赖 +partitioning&datum&AGGREGATION +iterative work&partitioning&依赖 +data sort&clustering&依赖 +new centre&Step 8&依赖 +new centre&Repeat 1-7&依赖 +their&Step7& +one&k centre&AGGREGATION +new centre&Step 8&依赖 +all datum point¢re&依赖 +Input&k centre&依赖 +new centre&Repeat 1-7&依赖 +new centre&Repeat 1-7&依赖 +new centre&Step 8&依赖 +process enormous quantity&datum&AGGREGATION +dizzying array&source&AGGREGATION +organization&customer&依赖 +competitive advantage&6 ]&依赖 +their&customers& +large and heterogeneous dataset&RGPV 275&依赖 +large and heterogeneous dataset&RGPV 275&依赖 +large and heterogeneous dataset&continuous flow&依赖 +large and heterogeneous dataset&Nov 13-15&依赖 +large and heterogeneous dataset&RGPV 275&依赖 +large and heterogeneous dataset&Nov 13-15&依赖 +engineer&information processing tools and application&依赖 +continuous flow&datum&AGGREGATION +large and heterogeneous dataset&RGPV 275&依赖 +large and heterogeneous dataset&datum&依赖 +large and heterogeneous dataset&Nov 13-15&依赖 +large and heterogeneous dataset&Nov 13-15&依赖 +wide range&task&AGGREGATION +large and heterogeneous dataset&Nov 13-15&依赖 +large and heterogeneous dataset&RGPV 275&依赖 +massive amount&datum&AGGREGATION +mystery&life&AGGREGATION +secret&cosmos&AGGREGATION +variety&problem&AGGREGATION +tool&task&依赖 +single opportunity&map&依赖 +many example&algorithm&AGGREGATION +them&barrier&实现 +single opportunity&map&依赖 +phase&processing )&AGGREGATION 
+them&map&实现 +existence&shared global state&AGGREGATION +them&mapreduce (&实现 +single opportunity&map&依赖 +model parameter&shared global state&依赖 +model&training datum&依赖 +process&access&依赖 +process&access&依赖 +process&state&依赖 +process&state&依赖 +process&access&依赖 +process&state&依赖 +synchronization&MapReduce framework&依赖 +synchronization&resource&AGGREGATION +update&one or more reducer&依赖 +synchronization&batch learner&依赖 +update&driver code )&依赖 +smaller number&instance&AGGREGATION +design choice&most existing MapReduce implementation&AGGREGATION +faster processing&smaller dataset&AGGREGATION +style&insufficient use&依赖 +style&insufficient use&依赖 +MapReduce&batch operation&依赖 +MapReduce&datum&依赖 +MapReduce&large amount&依赖 +insufficient use&resource&AGGREGATION +large amount&datum&AGGREGATION +style&computation&AGGREGATION +style&resource&依赖 +style&resource&依赖 +ADVANCEMENTS stream&dealing&依赖 +ADVANCEMENTS stream&alternative programming model&依赖 +one or more stream&input&AGGREGATION +its&design& +Pregel [ 16 ]&programming model&实现 +Valiant&model& +Pregel&large-scale graph algorithm&依赖 +Pig [ 28 ]&data analytics platform&依赖 +Pig script&join&依赖 +Pig script&execution engine&依赖 +Pig script&Hadoop job&依赖 +Pig&engine& +open-source project&user&依赖 +open-source project&large relational dataset&依赖 +open-source project&SQL query&依赖 +top&Hadoop&AGGREGATION +advantage&datum processing capability&AGGREGATION +Hadoop&capabilities& +user&abstraction&AGGREGATION +power&MapReduce&AGGREGATION +power&large cluster&AGGREGATION +development&alternative approach&AGGREGATION +MapReduce&Hadoop/HDFS/MapReduceecosystem&依赖 +MapReduce&generalization&依赖 +MapReduce&Hadoop/HDFS/MapReduceecosystem&依赖 +paper&Map Reduce task&依赖 +join processing mention&n&依赖 +join processing mention&n&依赖 +drawback&present system&AGGREGATION +future direction&traditional datum analysis tool&依赖 +future direction&traditional datum analysis tool&依赖 +paradigm&HDFS and Hadoop&AGGREGATION +1 ] jefry dean and MapReduce&1 ] jefry dean and MapReduce&依赖 +A Flexible Data Processing Tool and Communications and Volume 53&pp 72-77&依赖 +A Flexible Data Processing Tool and Communications and Volume 53&pp 72-77&依赖 +A Flexible Data Processing Tool and Communications and Volume 53&pp 72-77&依赖 +A Flexible Data Processing Tool and Communications and Volume 53&pp 72-77&依赖 +Communications&ACM&AGGREGATION +[ 2 ] jefry dean&[ 2 ] jefry dean&依赖 +Communications&ACM , Volume 51 pp.&AGGREGATION +you&era&依赖 +you&„ big data ‟&依赖 +era&„ big data ‟&AGGREGATION +University&Houston&AGGREGATION +Comparison&Join Algorithms&AGGREGATION +13 ] S. Ghemawat&Google File System&依赖 +[ 16 ] grzegorzmalewicz&pp 135-145&依赖 +[ 16 ] grzegorzmalewicz&pp 135-145&依赖 +[ 16 ] grzegorzmalewicz&pp 135-145&依赖 +[ 16 ] grzegorzmalewicz&pp 135-145&依赖 +[ 16 ] grzegorzmalewicz&pp 135-145&依赖 +/ / www.microsoft.com/windowsazure/features/storage/ [ 25 ] The Age&Big Data&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..916b72dd1136ac5988e6f9948a37bcb8ee509848 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks-relation.txt @@ -0,0 +1,4 @@ +copyright � 2018 elsevier b.v.©right � 2018 elsevier b.v.&依赖 +its&licensors& +registered trademark&elsevier b.v. 
term and condition&AGGREGATION +ScienceDirect �&elsevier b.v. term and condition&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d5debe8d6d827c54c7c77a01fb8ca31c3065f4d --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce-relation.txt @@ -0,0 +1,176 @@ +huge amount&datum&AGGREGATION +we&application&依赖 +we&which&依赖 +we&large cluster&依赖 +we&huge amount&依赖 +large cluster&commodity hardware&AGGREGATION +we&reliable manner&依赖 +MapReduce algorithm&two important task&依赖 +MapReduce algorithm&namely map&依赖 +Map&set&依赖 +set&datum&AGGREGATION +Map&datum&依赖 +smaller set&tuple&AGGREGATION +sequence&name MapReduce&AGGREGATION +major advantage&MapReduce&AGGREGATION +thousand&machine&AGGREGATION +we&application&依赖 +we&MapReduce form&依赖 +simple scalability&many programmer&依赖 +simple scalability&MapReduce model&依赖 +MapReduce program&three stage&依赖 +MapReduce program&program&GENERALIZATION +form&file or directory&AGGREGATION +input datum&file or directory&依赖 +input file&file&GENERALIZATION +input file&line&依赖 +input file&mapper function line&依赖 +mapper&datum&依赖 +several small chunk&datum&AGGREGATION +combination&Shuffle stage&AGGREGATION +Reducer ’s job&datum&依赖 +it&new set&依赖 +it&output&依赖 +it&new set&依赖 +it&output&依赖 +new set&output&AGGREGATION +Hadoop&appropriate server&依赖 +Hadoop&Map and Reduce task&依赖 +Hadoop&Map and Reduce task&依赖 +Hadoop&cluster&依赖 +framework&detail&依赖 +detail&data-passing such as&AGGREGATION +framework&data-passing such as&依赖 +framework&task&依赖 +local disk&network traffic&依赖 +Most&computing&AGGREGATION +Most&place&依赖 +completion&given task&AGGREGATION +cluster&given task&依赖 +cluster&completion&依赖 +set&< key , value > pair&AGGREGATION +job&different type&AGGREGATION +MapReduce framework&framework&GENERALIZATION +framework&input&依赖 +MapReduce framework&< key&依赖 +framework&set&依赖 +framework&input&依赖 +output&job&AGGREGATION +framework&< key , value > pair&依赖 +key&framework&依赖 +key class&Writable-Comparable interface&实现 +Input and Output type&a mapreduce job − ( input )&AGGREGATION +Input and Output type&v3 > ( output )&依赖 +Input and Output type&v2 > → reduce → < k3&依赖 +core&job&AGGREGATION +Input Output Map < k1 and v1 > list ( and < k2 , v2 > ) reduce&Map&实现 +Input Output Map < k1 and v1 > list ( and < k2 , v2 > ) reduce&Map&实现 +Input Output Map < k1 and v1 > list ( and < k2 , v2 > ) reduce&Map&实现 +Mapper − Mapper&set&依赖 +Mapper − Mapper&input key/value pair&依赖 +set&intermediate key/value pair&AGGREGATION +Mapper − Mapper&intermediate key/value pair&依赖 +namednode − node&file system ( hdf )&依赖 +datum&advance&依赖 +datanode − node&datanode − node&依赖 +processing&place&依赖 +masternode − node&masternode − node&依赖 +slavenode − node&slavenode − node&依赖 +jobtracker − schedules job&jobtracker − schedules job&依赖 +Task Tracker −&task and report status&依赖 +Task Tracker −&JobTracker&依赖 +program&dataset&依赖 +program&dataset&依赖 +program&Mapper and Reducer&依赖 +execution&Mapper and Reducer&AGGREGATION +task −&Mapper&依赖 +task −&Mapper&依赖 +task −&execution&依赖 +task −&Mapper&依赖 +task −&Mapper&依赖 +task −&execution&依赖 +task −&execution&依赖 +task −&execution&依赖 +task −&execution&依赖 +task −&Mapper&依赖 +task −&execution&依赖 +task −&Mapper&依赖 +task −&execution&依赖 +task −&execution&依赖 +task −&Mapper&依赖 +task −&execution&依赖 +task −&Mapper&依赖 +task −&Mapper&依赖 +execution&Mapper&AGGREGATION 
+slice&datum&AGGREGATION +task attempt −&particular instance&依赖 +task attempt −&an attempt&AGGREGATION +electrical consumption&organization&AGGREGATION +It&monthly electrical consumption&依赖 +we&application&依赖 +above datum&input&依赖 +year&maximum usage and year&AGGREGATION +year&minimum usage&AGGREGATION +finite number&record&AGGREGATION +They&required output&依赖 +They&logic&依赖 +electrical consumption&largescale industry&AGGREGATION +largescale industry&particular state&AGGREGATION +its&formation& +we&such bulk datum&依赖 +They&time&依赖 +They&lot&依赖 +lot&time&AGGREGATION +we&datum&依赖 +we&source&依赖 +we&network server&依赖 +we&MapReduce framework&依赖 +1979 23 23 2 43 24 25 26 26 26 26 25 26 25 1980 26 27 28 28 28 30 31 31 31 30 30 30 29 1981 31 32 32 32 33 34 35 36 36 34 34 34 34 1984 39 38 39 39 39 41 42 43 40 39 38 38 40 1985 38 39 39 39 39 41 41 41 00 40 39 39 45 Example Program&MapReduce framework&依赖 +1979 23 23 2 43 24 25 26 26 26 26 25 26 25 1980 26 27 28 28 28 30 31 31 31 30 30 30 29 1981 31 32 32 32 33 34 35 36 36 34 34 34 34 1984 39 38 39 39 39 41 42 43 40 39 38 38 40 1985 38 39 39 39 39 41 41 41 00 40 39 39 45 Example Program&sample datum&依赖 +1979 23 23 2 43 24 25 26 26 26 26 25 26 25 1980 26 27 28 28 28 30 31 31 31 30 30 30 29 1981 31 32 32 32 33 34 35 36 36 34 34 34 34 1984 39 38 39 39 39 41 42 43 40 39 38 38 40 1985 38 39 39 39 39 41 41 41 00 40 39 39 45 Example Program&sample datum&依赖 +1979 23 23 2 43 24 25 26 26 26 26 25 26 25 1980 26 27 28 28 28 30 31 31 31 30 30 30 29 1981 31 32 32 32 33 34 35 36 36 34 34 34 34 1984 39 38 39 39 39 41 42 43 40 39 38 38 40 1985 38 39 39 39 39 41 41 41 00 40 39 39 45 Example Program&MapReduce framework&依赖 +/ / Reducer class public static class E_EReduce extends MapReduceBase&IntWritable > {&实现 +/ / Reducer class public static class E_EReduce extends MapReduceBase&Reducer < Text and IntWritable and Text&实现 +compilation and execution&program&AGGREGATION +home directory&Hadoop user&AGGREGATION +Compilation and Execution&Process Units Program&AGGREGATION +we&Hadoop user&依赖 +we&home/hadoop )&依赖 +Step 1&directory&依赖 +Step 1&compiled java class&依赖 +Step 1&compiled java class&依赖 +Step 1&directory&依赖 +$ mkdir unit&2 Download Hadoop-core-1.2.1.jar&依赖 +follow link mvnrepository.com&jar&依赖 +input_dir step&5&依赖 +input directory&hdf&AGGREGATION +$ HADOOP_HOME / bin/hadoop jar units.jar hadoop.ProcessUnits&while&依赖 +$ HADOOP_HOME / bin/hadoop jar units.jar hadoop.ProcessUnits&Wait&依赖 +output&number&依赖 +output&number&依赖 +output&input split&依赖 +number&input split&AGGREGATION +number&Map task&AGGREGATION +output&input split&依赖 +number&reducer task&AGGREGATION +FILE&large read operation&AGGREGATION +Number&write operation&AGGREGATION +Number&byte&AGGREGATION +Number&read operation&AGGREGATION +Number&large read operation&AGGREGATION +File&Counters& +file&hdf&依赖 +Hadoop command&$ HADOOP_HOME / bin/hadoop command&依赖 +cat output_dir / part-00000 / bin/hadoop df&output_dir / home/hadoop Important command&依赖 +table&option&依赖 +their&description& +20 distcp &20 distcp &依赖 +class path&Hadoop jar&依赖 +Hadoop jar&jar&GENERALIZATION +events&details& +- list&job&依赖 +Killed task&failed attempt&依赖 +Failed task&failed attempt&依赖 +priority&job&AGGREGATION +history&bin/hadoop job&AGGREGATION +status&bin/hadoop job&AGGREGATION diff --git "a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce-relation.txt" "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & 
MapReduce-relation.txt" new file mode 100644 index 0000000000000000000000000000000000000000..739e1aa773dbd933acd360bdc4007a51b8fcf466 --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce-relation.txt" @@ -0,0 +1,346 @@ +Hadoop Architecture&Big Data Course !!&依赖 +your&career& +Hadoop Architecture&Big Data Course !!&依赖 +design&Hadoop&AGGREGATION +design&various goal&依赖 +handling&large dataset&AGGREGATION +we&blog&依赖 +we&Hadoop Architecture&依赖 +we&detail&依赖 +we&Hadoop Architecture Diagram&依赖 +’s&Hadoop Architecture&依赖 +master-slave topology&topology&GENERALIZATION +Hadoop&master-slave topology&依赖 +we&one master node&依赖 +we&topology&依赖 +’s function&task&依赖 +’s function&various slave node&依赖 +node&function& +slave node&actual computing&依赖 +Slave node&real datum&依赖 +we&master&依赖 +metadata&what&依赖 +Hadoop Architecture&three major layer&依赖 +hdf ( hadoop&file system ) yarn mapreduce 1&依赖 +hdf ( hadoop&file system ) yarn mapreduce 1&依赖 +HDFS hdf&Hadoop Distributed File System&依赖 +data storage&Hadoop&AGGREGATION +hdf&data unit&依赖 +hdf&smaller unit&依赖 +It&two daemons run&依赖 +namenode and datanode hdfs&Master-slave architecture&依赖 +daemon&master server&依赖 +daemon&master server&依赖 +It&Namespace management&依赖 +DataNode daemon&slave node&依赖 +DataNode daemon&daemon&GENERALIZATION +file&data block&依赖 +file&number&依赖 +number&data block&AGGREGATION +group&slave machine&AGGREGATION +Namenode&system namespace&依赖 +Namenode&modification&依赖 +opening&files or directory&依赖 +NameNode&track&依赖 +NameNode&DataNodes&依赖 +mapping&block&AGGREGATION +NameNode&mapping&依赖 +track&mapping&AGGREGATION +DataNodes&read/write request&依赖 +DataNodes&file system ’s client&依赖 +DataNode&NameNode&依赖 +DataNode&delete&依赖 +DataNode&demand&依赖 +native language&hdf&AGGREGATION +Java&hdf&依赖 +one&machine&依赖 +one&DataNode and NameNode&依赖 +one&having&依赖 +one dedicated machine&typical deployment&依赖 +one dedicated machine&typical deployment&依赖 +other node&cluster run datanode&依赖 +other node&cluster run datanode&依赖 +NameNode&metada&依赖 +location&block&AGGREGATION +NameNode&metada&依赖 +NameNode&block&依赖 +NameNode&DataNodes&依赖 +You&hadoop high availability concept&依赖 +You&hadoop high availability concept&依赖 +smallest unit&storage&AGGREGATION +default block size&block size&GENERALIZATION +block size&size&GENERALIZATION +we&128MB&依赖 +we&default block size&依赖 +we&default block size&依赖 +we&128MB&依赖 +default block size&128MB&AGGREGATION +One&block size&依赖 +us&example&依赖 +us&file&依赖 +example&file&AGGREGATION +128mb then hdf&6 block&依赖 +our&size& +128mb then hdf&file&依赖 +128MB and one block&60MB&AGGREGATION +Five block&128MB and one block&AGGREGATION +we&size&依赖 +we&size&依赖 +we&file&依赖 +we&file&依赖 +file&size&AGGREGATION +we&file&依赖 +we&file&依赖 +we&size&依赖 +we&size&依赖 +we&numerous block&依赖 +4kb&block size&AGGREGATION +huge metada&NameNode&依赖 +Replication Management&replication technique&依赖 +copy&block and store&AGGREGATION +it©&依赖 +it&block and store&依赖 +it&block and store&依赖 +it©&依赖 +it&different datanode&依赖 +it&different datanode&依赖 +how many copy&block&AGGREGATION +we&value&依赖 +It&default&依赖 +file&1GB&AGGREGATION +we&file&依赖 +we&1GB&依赖 +replication factor&3&AGGREGATION +it&3gb&依赖 +3gb&total storage&AGGREGATION +it&total storage&依赖 +NameNode&block report&依赖 +NameNode&DataNode&依赖 +NameNode&replica&依赖 +rack&many DataNode machine&依赖 +hdf&block&依赖 +replica&block&AGGREGATION +hdf&replica&依赖 +hdf&rack awareness algorithm&依赖 +hdf&distributed fashion&依赖 +rack awareness algorithm&local 
rack&依赖 +rack awareness algorithm&first block&依赖 +It&more than two block&依赖 +It&same rack&依赖 +It&possible&依赖 +MapReduce MapReduce&Hadoop&依赖 +MapReduce MapReduce&MapReduce&GENERALIZATION +data processing layer&Hadoop&AGGREGATION +large amount&datum&AGGREGATION +MapReduce&cluster&依赖 +cluster&low-end machine&AGGREGATION +MapReduce&low-end machine&依赖 +MapReduce&application&依赖 +MapReduce&application&依赖 +It&reliable and fault-tolerant manner&依赖 +number&map task&AGGREGATION +MapReduce job&map task&依赖 +MapReduce job&number&依赖 +task&part&依赖 +part&datum&AGGREGATION +task&datum&依赖 +function&transform and filter datum&依赖 +function&Map task&AGGREGATION +sub-set&output&AGGREGATION +Reduce task&intermediate datum&依赖 +Reduce task&aggregation&依赖 +Reduce task&map task&依赖 +input file&hdf&依赖 +input file&file&GENERALIZATION +MapReduce job&job&GENERALIZATION +input file&hdf&依赖 +inputformat&input file&依赖 +byte-oriented view&chunk&AGGREGATION +chunk&input file&AGGREGATION +input split&map task&依赖 +map task&task&GENERALIZATION +map task&node&依赖 +RecordReader The recordreader&record&依赖 +RecordReader The recordreader&input split&依赖 +It&datum&依赖 +It&record&依赖 +mapper function&function&GENERALIZATION +datum&record&依赖 +mapper&phase&依赖 +mapper&key-value pair&依赖 +mapper&recordreader&依赖 +It&zero or multiple intermediate key-value pair&依赖 +decision&mapper function&依赖 +decision&mapper function&依赖 +reducer function&datum&依赖 +reducer function&operation&依赖 +reducer function&function&GENERALIZATION +Combiner&intermediate datum&依赖 +Combiner&mapper&依赖 +It&one mapper&依赖 +small scope&one mapper&AGGREGATION +It&small scope&依赖 +amount&datum&AGGREGATION +1 ) three time&more network bandwidth&依赖 +1 ) three time&example&依赖 +Partitioner Partitioner&intermediate key-value pair&依赖 +Partitioner Partitioner&mapper&依赖 +It&one shard&依赖 +It&them&依赖 +It&reducer&依赖 +It&them&依赖 +partitioner&hashcode&依赖 +hashcode&key&AGGREGATION +partitioner&key&依赖 +partitioner&hashcode&依赖 +partitioner&key&依赖 +partitioner&modulus operation&依赖 +partitioner&reducer )&依赖 +number&reducer&AGGREGATION +key.hashcode ( ) % ( number&reducer )&AGGREGATION +partitioner&modulus operation&依赖 +partitioner&key.hashcode ( ) % ( number&依赖 +partitioned datum&local file system&依赖 +partitioned datum&map task&依赖 +reducer&it&依赖 +reducer&shuffle and sort step&依赖 +this step download&written&依赖 +this step download&datum&依赖 +this step sort&individual data piece&依赖 +this step sort&individual data piece&依赖 +this step sort&large data list&依赖 +this step sort&large data list&依赖 +purpose&sort&AGGREGATION +purpose&equivalent key&依赖 +we&it&依赖 +framework&everything&依赖 +key&comparator object&依赖 +developer&control&依赖 +reducer&function&依赖 +reducer&key grouping&依赖 +framework&function key&依赖 +function key&key&GENERALIZATION +number&different way&AGGREGATION +We&reducer&依赖 +it&zero or more key-value pair&依赖 +it&solution&依赖 +core logic&solution&AGGREGATION +It&key-value pair&依赖 +It&reducer&依赖 +it&key and value&依赖 +it&tab&依赖 +it&tab&依赖 +it&key and value&依赖 +We&it&依赖 +YARN YARN&Hadoop&依赖 +YARN YARN&YARN&GENERALIZATION +resource management layer&Hadoop&AGGREGATION +basic principle&resource management and job scheduling/monitoring function&依赖 +basic principle&resource management and job scheduling/monitoring function&依赖 +one global ResourceManager and per-application ApplicationMaster&YARN&依赖 +Application&job&依赖 +single job&job&AGGREGATION +we&YARN framework&依赖 +YARN framework&framework&GENERALIZATION +we&two daemon resourcemanager and nodemanager&依赖 +ResourceManager&system&依赖 +ResourceManager&application&依赖 
+ResourceManager&resource&依赖 +job&container&依赖 +job&resource usage&依赖 +job&NodeManger&AGGREGATION +ApplcationMaster&ResourceManager&依赖 +ApplcationMaster&resource&依赖 +ResourceManger&Scheduler&依赖 +ResourceManger&two important component&依赖 +it&tracking&依赖 +tracking&status&AGGREGATION +it&application&依赖 +it&tracking&依赖 +it&application&依赖 +It&task&依赖 +scheduler&resource&依赖 +requirement&application&AGGREGATION +function&ApplicationManager&AGGREGATION +Application Manager follow&ApplicationManager&依赖 +container&CPU , memory , disk , and network&依赖 +container&element&依赖 +function&ApplicationMaster&AGGREGATION +monitor progress&application&AGGREGATION +We&YARN&依赖 +We&YARN Federation feature&依赖 +We&few thousand node&依赖 +feature&multiple YARN cluster&依赖 +feature&us&依赖 +feature&Yarn YARN&AGGREGATION +feature&features :&依赖 +YARN&access engine&依赖 +variety&access engine&AGGREGATION +YARN&open-source or propriety )&依赖 +YARN&variety&依赖 +YARN&resource&依赖 +YARN&cluster&依赖 +YARN&dynamic allocation&依赖 +YARN&good use&依赖 +dynamic allocation&resource&AGGREGATION +good use&cluster&AGGREGATION +previous version&Hadoop&AGGREGATION +lesser utilization&cluster&AGGREGATION +YARN ’s ResourceManager&ever-expanding cluster&依赖 +petabyte&datum&AGGREGATION +YARN ’s ResourceManager&scheduling and cope&依赖 +YARN ’s ResourceManager&petabyte&依赖 +YARN ’s ResourceManager&datum&依赖 +MapReduce program&YARN&依赖 +MapReduce program&YARN&依赖 +people&idea&依赖 +people&Hadoop&依赖 +Hadoop&cheap storage and deep datum analysis&依赖 +this use jbod&Disk&依赖 +this use jbod&a bunch&依赖 +their&complexity& +Start Small and Keep Focus Many project&complexity and expense&依赖 +small cluster&node&AGGREGATION +infrastructure and development guy&internal working&依赖 +infrastructure and development guy&Hadoop&依赖 +internal working&Hadoop&AGGREGATION +Data Integration One&feature&AGGREGATION +feature&Hadoop&AGGREGATION +we&data structure&依赖 +We&flume and sqoop&依赖 +We&tool&依赖 +We&datum&依赖 +it&data integration process&依赖 +proper documentation&data source&AGGREGATION +they&cluster&依赖 +Use Compression Technique Enterprise&compression&依赖 +Use Compression Technique Enterprise&love-hate relationship&依赖 +it&performance&依赖 +compression&storage&依赖 +Hadoop&compression&依赖 +It&storage usage&依赖 +It&80 %&依赖 +different project&different requirement&依赖 +Apache Hadoop&Hadoop&GENERALIZATION +Apache Hadoop&wide ecosystem&依赖 +different project&different requirement&依赖 +it&itself&依赖 +design&Hadoop Architecture&AGGREGATION +Its&structure& +We&linearly&依赖 +MapReduce part&principle&依赖 +MapReduce part&principle&依赖 +principle&data locality&AGGREGATION +MapReduce part&design&AGGREGATION +MapReduce part&part&GENERALIZATION +MapReduce part&data locality&依赖 +MapReduce part&data locality&依赖 +Map-Reduce framework&datum&依赖 +Map-Reduce framework&computation close&依赖 +network traffic&major bandwidth&依赖 +overall architecture&Hadoop&AGGREGATION +your&Interview& +Hadoop Architecture&Hadoop Interview&依赖 +We&you&依赖 +You&Hadoop Architecture&依赖 +You&many question&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..323f777eb57a0e0f4b63e76fcbe42eb58c4cd3fc --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data-relation.txt @@ -0,0 +1,16 @@ +highest 
unit&work&AGGREGATION +MapReduce programming paradigm&Map Stage&依赖 +MapReduce programming paradigm&two-step data analysis process&依赖 +map stage&set&依赖 +set&datum&AGGREGATION +map stage&datum&依赖 +output&map function&AGGREGATION +Reduce job&output&依赖 +Reduce job&map function&依赖 +smaller set&tuple&AGGREGATION +reduce job&sequence&依赖 +sequence&name MapReduce&AGGREGATION +job&several mappers and reducer&依赖 +portion&task&依赖 +portion&job&AGGREGATION +slice&datum&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture1-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture1-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..57cac498c18d1631d83e4d12d33a1177b7ebbb26 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture1-relation.txt @@ -0,0 +1,107 @@ +Introduction&large set&依赖 +Introduction&datum&依赖 +large set&datum&AGGREGATION +Google&that&依赖 +MapReduce&picture&依赖 +huge chunk&datum&AGGREGATION +huge amount&datum&AGGREGATION +parallel processing&huge amount&AGGREGATION +Map Reduce program&programmer&依赖 +they&MapReduce&依赖 +development&applications and deployment&AGGREGATION +development&programmer&依赖 +flow pattern&MapReduce&AGGREGATION +they&flow pattern&依赖 +Explanation&Python and C++&依赖 +Explanation&MapReduce Architecture Hadoop&AGGREGATION +Explanation&programming language&依赖 +application&software processing huge amount&AGGREGATION +software processing huge amount&datum&AGGREGATION +framework&task&依赖 +chunk&datum&AGGREGATION +framework&chunk&依赖 +a file-system store&work and input&依赖 +work and input&job&AGGREGATION +a file-system store&job&依赖 +Re-execution&framework&依赖 +Re-execution&framework&依赖 +task&framework&AGGREGATION +Re-execution&failed task&AGGREGATION +architecture&two main processing stage&依赖 +architecture&MapReduce&AGGREGATION +MapReduce&Job tracker&依赖 +Intermediate process&place&依赖 +local file system store&intermediate datum&依赖 +take&other datum&依赖 +certain number&output&AGGREGATION +take&it&依赖 +take&set&依赖 +breakdown&individual element&AGGREGATION +take&it&依赖 +take&set&依赖 +take&other datum&依赖 +set&other datum&AGGREGATION +Mappers output&reduction&依赖 +single mapper&reduced function&依赖 +new output value&hdf&依赖 +MapReduce architecture&architecture&GENERALIZATION +MapReduce Architecture Components Below&component&依赖 +component&MapReduce architecture&AGGREGATION +MapReduce Architecture Components Below&MapReduce architecture&依赖 +explanation&component&AGGREGATION +Map Phase Map phase&two part&依赖 +Map Phase Map phase&input datum&依赖 +Value&processing stage&依赖 +Let ’s&input datum&依赖 +Key-value pair conversion&record reader&依赖 +Key-value pair conversion&input datum&依赖 +piece&data format and code&AGGREGATION +reducer code place input&reducer code place input&依赖 +reducer code place input&combiner&依赖 +reducer code place input&reducer code place input&依赖 +reducer code place input&combiner&依赖 +partition module&key role&依赖 +partition module&Hadoop&依赖 +map input&sort and shuffle phase&依赖 +intermediate datum&local file system&依赖 +Hadoop node&replication&依赖 +Reducer Phase&data input&依赖 +Reducer Phase&data input&依赖 +reducer&searching&依赖 +number&reducer&AGGREGATION +speculative execution&job processing&依赖 +speculative execution&prominent role&依赖 +task&run&依赖 +more than one mapper&similar datum&依赖 +task&next mapper&依赖 +task&fast program&依赖 +| Verifiable Certificate&Access&AGGREGATION +job&two component&依赖 +job&split&依赖 +job&Map task&依赖 +complete execution&given 
job&AGGREGATION +Conclusion&document&依赖 +lot&document&AGGREGATION +Conclusion&lot&依赖 +you&number&依赖 +number&word&AGGREGATION +occurrence&word&AGGREGATION +number&occurrence&AGGREGATION +you&word&依赖 +you&lot&依赖 +lot&web page&AGGREGATION +you&web page&依赖 +them&search query&依赖 +I&arbitrary task&依赖 +reducer&datum&依赖 +reducer&aggregation&依赖 +it&key&依赖 +aggregation&datum&AGGREGATION +Recommended Articles This&MapReduce Architecture&依赖 +component&architecture&AGGREGATION +we&explanation&依赖 +we&MapReduce Architecture&依赖 +we&component&依赖 +we&introduction&依赖 +You&more –&依赖 +our&articles& diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture2-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture2-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..4afd378239a129f057a052ebbc7c28e787d16417 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture2-relation.txt @@ -0,0 +1,117 @@ +a programming model and expectation&Hadoop&依赖 +underlying system&care&依赖 +Input&multiple chunks/blocks&依赖 +chunk/block&different node&依赖 +chunk/block&datum&AGGREGATION +MapReduce architecture&phase&依赖 +input files inputformat inputsplit recordreader mapper combiner partitioner shuffling&Reducer RecordWriter OutputFormat Input Files&依赖 +input datum&input file&依赖 +input file&hdf ( hadoop&依赖 +input file&file system )&依赖 +format&file&AGGREGATION +InputFormat - InputFormat&Map-Reduce job&依赖 +InputFormat - InputFormat&input-specification&依赖 +InputFormat&file or other object&依赖 +InputFormat&InputSplit&依赖 +InputFormat&selected input file&依赖 +InputFormat&byte&依赖 +InputFormat&input&依赖 +InputFormat&input file&依赖 +InputFormat&input&依赖 +byte&input file&AGGREGATION +InputSplit - InputSplit&InputFormat&依赖 +InputSplit&datum&依赖 +number&InputSplits&依赖 +number&map task&AGGREGATION +number&InputSplits&AGGREGATION +number&number&依赖 +number&InputSplits&依赖 +number&number&依赖 +record&specific mapper&依赖 +InputSplit&input record&依赖 +InputSplit&input&依赖 +InputSplit&byte-oriented view&依赖 +RecordReader - RecordReader&InputSplit&依赖 +RecordReader - RecordReader&Hadoop MapReduce&依赖 +RecordReader&InputSplit&依赖 +RecordReader&< key , value > pair&依赖 +RecordReader&InputSplit&依赖 +byte-oriented view&input&AGGREGATION +RecordReader&byte-oriented view&依赖 +RecordReader&input&依赖 +record-oriented view&input datum&AGGREGATION +RecordReader&datum&依赖 +RecordReader&key-value pair&依赖 +RecordReader&InputSplit&依赖 +key-value pair&further processing&依赖 +key-value pair&mapper&依赖 +Mapper - Mapper&input record&依赖 +input record&record&GENERALIZATION +mapper output&as intermediate output&依赖 +mapper output&local disk&依赖 +mapper output&output&GENERALIZATION +it&unnecessary copy&依赖 +Mappers output&combiner&依赖 +Mappers output&further process&依赖 +Mappers output&output&GENERALIZATION +Map&set&依赖 +set&datum&AGGREGATION +Map&datum&依赖 +Mapper&datum&依赖 +form&key/value pair&AGGREGATION +Mapper&key/value pair&依赖 +Mapper&form&依赖 +Combiner&map task&依赖 +output&map task&AGGREGATION +Combiner&output&依赖 +combiner&local reducer&依赖 +Hadoop&combiner function&依赖 +Hadoop&map output&依赖 +Hadoop&one or many time&依赖 +map output&output&GENERALIZATION +how output&reducer&依赖 +how output&reducer&依赖 +Partitioner&keys partition&依赖 +Partitioner&intermediate map-output&依赖 +keys partition&intermediate map-output&AGGREGATION +key&key&AGGREGATION +number&job&依赖 +number&reduce task&依赖 +number&of&AGGREGATION +total number&partition&AGGREGATION +its&execution& +Partitioner&same machine&依赖 
+mapper&execution&依赖 +partitioner form number&partitioner form number&依赖 +partitioner form number&reduce task group&依赖 +partitioner form number&reduce task group&AGGREGATION +Hadoop framework&hash base partitioner&依赖 +Hadoop framework&default&依赖 +Hadoop framework&default&依赖 +hash partitioner partition&key space&依赖 +hash partitioner partition&key space&依赖 +output&partitioner&AGGREGATION +physical movement&datum&AGGREGATION +shuffling&network&依赖 +shuffling&datum&依赖 +mapper&process&依赖 +output produce&reducer node&依赖 +their&process& +intermediate value&list&依赖 +reducer task&mapper&依赖 +reducer task&output&依赖 +reducer task&input&依赖 +smaller set&tuple&AGGREGATION +their&lists& +intermediate key&reducer&依赖 +intermediate key&sorted key order&依赖 +reducer&zero or more final key/value pair&依赖 +RecordWriter & OutputFormat&Reducer phase&依赖 +RecordWriter & OutputFormat&output file&依赖 +RecordWriter & OutputFormat&output key-value pair&依赖 +output key-value pair&key-value pair&GENERALIZATION +Reducer phase&phase&GENERALIZATION +way&OutputFormat&依赖 +final output&OutputFormat instance&依赖 +final output&hdf&依赖 +final output&reducer&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture3-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture3-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec67674ebb9e0c48052a55f9d5f847dec7fddfbc --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture3-relation.txt @@ -0,0 +1,71 @@ +two major component&Hadoop&AGGREGATION +2020 MapReduce and HDFS&Hadoop&依赖 +library&various different-different optimization&依赖 +library&many programming language&依赖 +it&equivalent task&依赖 +purpose&MapReduce&AGGREGATION +it&it&依赖 +MapReduce task&two phase map phase&依赖 +who&Job&依赖 +Components&MapReduce Architecture&AGGREGATION +who&MapReduce&依赖 +who&processing&依赖 +multiple client&job&依赖 +multiple client&job for ###&依赖 +multiple client&job&依赖 +multiple client&job for ###&依赖 +client&that&依赖 +Hadoop MapReduce Master&particular job&依赖 +Hadoop MapReduce Master&subsequent job-part&依赖 +result&final output&依赖 +result&job-part&AGGREGATION +final result&processing&依赖 +we&MapReduce&依赖 +we&client&依赖 +client&Hadoop MapReduce Master&依赖 +client&job&依赖 +job&particular size&AGGREGATION +client&particular size&依赖 +MapReduce master&job&依赖 +MapReduce master&further equivalent job-part&依赖 +Map and Reduce task&use-case&依赖 +requirement&use-case&AGGREGATION +Map and Reduce task&requirement&依赖 +developer&their logic&依赖 +developer&requirement&依赖 +industry&that&依赖 +their&logic& +input datum&Map Task&依赖 +its&output& +Map&intermediate key-value pair&依赖 +Map&output&依赖 +we&which&依赖 +key-value pair&Reducer&依赖 +final output&hdf&依赖 +output&map i.e.&AGGREGATION +n number&Map and Reduce task&AGGREGATION +algorithm&minimum&依赖 +algorithm&optimized way&依赖 +’s&MapReduce phase&依赖 +’s&architecture&依赖 +its&architecture& +’s&better understanding&依赖 +better understanding&architecture&AGGREGATION +MapReduce task&2 phase i.e. 
map phase&依赖 +its&use& +key&kind&依赖 +id&kind&AGGREGATION +kind&address and value&AGGREGATION +key&address and value&依赖 +its&repository& +Map ( ) function&memory repository&依赖 +its&pair& +work&Job tracker&AGGREGATION +hundred&data node&AGGREGATION +work&resource&依赖 +Task Tracker&actual slave&依赖 +cluster&Map and Reduce task&依赖 +also one important component&MapReduce Architecture&AGGREGATION +daemon process&task or application&依赖 +daemon process&historical information&依赖 +log&Job History Server&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Tutorial-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Tutorial-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..f69f018e1c74d0864b47d113a689c57ed421f90a --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Tutorial-relation.txt @@ -0,0 +1,1118 @@ +MapReduce framework&framework&GENERALIZATION +Purpose This document&user-facing facet&依赖 +Hadoop MapReduce framework&MapReduce framework&GENERALIZATION +Purpose This document&Hadoop MapReduce framework&依赖 +user-facing facet&Hadoop MapReduce framework&AGGREGATION +single node setup&first-time user&依赖 +single node setup&first-time user&依赖 +application&large cluster&依赖 +vast amount and multi-terabyte data-set&datum&AGGREGATION +application&commodity hardware&依赖 +application&vast amount and multi-terabyte data-set&依赖 +large cluster&commodity hardware&AGGREGATION +thousand&node&AGGREGATION +MapReduce job&input data-set&依赖 +MapReduce job&job&GENERALIZATION +MapReduce job&independent chunk&依赖 +output&map&AGGREGATION +framework sort&output&依赖 +framework sort&map&依赖 +input&file-system&依赖 +output&job&AGGREGATION +care&scheduling task&AGGREGATION +framework&care&依赖 +framework&scheduling task&依赖 +file system (&same set&依赖 +file system (&same set&依赖 +same set&node&AGGREGATION +file system (&node&依赖 +file system (&node&依赖 +configuration&resulting&依赖 +configuration&framework&依赖 +configuration&task&依赖 +MapReduce framework&single master JobTracker&依赖 +jobs&tasks& +slave&task&依赖 +implementation&appropriate interfaces and/or abstract-class&AGGREGATION +application&input/output locations and supply map&依赖 +job configuration&configuration&GENERALIZATION +Hadoop job client&JobTracker&依赖 +JobTracker&responsibility&依赖 +Hadoop job client&job ( jar/executable etc. 
) and configuration&依赖 +JobTracker&job-client&依赖 +JobTracker&software/configuration&依赖 +Hadoop framework&JavaTM&实现 +MapReduce application&Java&依赖 +Hadoop framework&framework&GENERALIZATION +mapper&reducer&依赖 +Hadoop Pipes&compatible C++ API&依赖 +set&< key , value > pair&AGGREGATION +job&different type&AGGREGATION +Inputs and output&input&依赖 +Inputs and output&input&依赖 +key class&WritableComparable interface&实现 +input and output type&MapReduce job&AGGREGATION +we&detail&依赖 +MapReduce application&application&GENERALIZATION +simple application&number&依赖 +simple application&word&依赖 +number&word&AGGREGATION +number&occurence&AGGREGATION +occurence&word&AGGREGATION +simple application&number&依赖 +simple application&word&依赖 +source code wordcount.java 1&source code wordcount.java 1&依赖 +public class wordcount { 13&public class wordcount { 13&依赖 +public void map&IOException { 19&依赖 +public void map&IOException { 19&依赖 +public void map&IOException { 19&依赖 +public void map&IOException { 19&依赖 +while ( tokenizer.hasmoretokens&) ) { 22&依赖 +public void reduce&IOException { 30&依赖 +public void reduce&IOException { 30&依赖 +public void reduce&IOException { 30&依赖 +public void reduce&IOException { 30&依赖 +while ( values.hasnext ( ) ) { 32&while ( values.hasnext ( ) ) { 32&依赖 +public static void main ( string [ ] arg&Exception { 39&依赖 +root&installation and hadoop_version be&AGGREGATION +/&input&GENERALIZATION +/&output&GENERALIZATION +input&/&GENERALIZATION +cat / usr/joe/wordcount / output/part -00000 Bye 1 Goodbye 1 Hadoop 2 Hello 2 World 2 application&comma separated list&依赖 +cat / usr/joe/wordcount / output/part -00000 Bye 1 Goodbye 1 Hadoop 2 Hello 2 World 2 application&path&依赖 +comma separated list&path&AGGREGATION +current working directory&task&AGGREGATION +classpath&map&AGGREGATION +- libjars option&application&依赖 +- libjars option&jar&依赖 +archive&argument&依赖 +archive&comma separated list&依赖 +comma separated list&archive&AGGREGATION +archive&them&依赖 +link&task&依赖 +link¤t working directory&依赖 +name&archive&AGGREGATION +More detail&Commands Guide&依赖 +More detail&Commands Guide&依赖 +myarchive.zip&directory&依赖 +myarchive.zip&name " myarchive.zip "&依赖 +user&##&依赖 +user&different symbolic name&依赖 +txt&task&依赖 +txt&example&依赖 +txt&symbolic name dict1 and dict2&依赖 +archive mytar.tgz&directory&依赖 +archive mytar.tgz&name " tgzdir "&依赖 +mapper implementation ( line&time&依赖 +mapper implementation ( line&one line&依赖 +mapper implementation ( line&map method ( line&依赖 +It&line&依赖 +key-value pair&< , 1 >&AGGREGATION +It&token&依赖 +We&more&依赖 +We&map&依赖 +number&map&AGGREGATION +We&number&依赖 +WordCount&a combiner ( line&依赖 +output&local combiner (&依赖 +output&same&依赖 +output&local aggregation&依赖 +2 > The output&second map&AGGREGATION +output&value&依赖 +output&first map&AGGREGATION +< Bye&JobConf&依赖 +run method&various facet&依赖 +run method&job&依赖 +< Bye&JobConf&依赖 +< Bye&input/output format etc.&依赖 +< Bye&JobConf&依赖 +< Bye&JobConf&依赖 +< Bye&JobConf&依赖 +< Bye&input/output format etc.&依赖 +< Bye&input/output format etc.&依赖 +< Bye&input/output format etc.&依赖 +< Bye&input/output format etc.&依赖 +< Bye&JobConf&依赖 +< Bye&JobConf&依赖 +various facet&job&AGGREGATION +< Bye&input/output format etc.&依赖 +< Bye&input/output format etc.&依赖 +< Bye&JobConf&依赖 +< Bye&JobConf&依赖 +command line&line&GENERALIZATION +< Bye&input/output format etc.&依赖 +< Bye&input/output format etc.&依赖 +< Bye&JobConf&依赖 +< Bye&input/output format etc.&依赖 +< Bye&input/output format etc.&依赖 +run method&input/output path&依赖 +< Bye&input/output format etc.&依赖 +< 
Bye&JobConf&依赖 +< Bye&JobConf&依赖 +< Bye&input/output format etc.&依赖 +< Bye&input/output format etc.&依赖 +< Bye&JobConf&依赖 +< Bye&JobConf&依赖 +It&and&依赖 +It&jobclient.runjob ( line&依赖 +its&progress& +We&JobConf&依赖 +We&more&依赖 +We&JobConf&依赖 +reasonable amount&detail&AGGREGATION +user-facing aspect&MapReduce framework&AGGREGATION +their&jobs& +us&Mapper and Reducer interface&依赖 +application&them&实现 +We&JobConf , JobClient , Partitioner , OutputCollector , Reporter , InputFormat , OutputFormat , OutputCommitter and other&依赖 +We&other core interface&依赖 +useful feature&framework&AGGREGATION +payload application&Mapper and Reducer interface&实现 +core&job&AGGREGATION +Mapper Mapper&set&依赖 +set&intermediate key/value pair&AGGREGATION +Mapper Mapper&input key/value pair&依赖 +Mapper Mapper&intermediate key/value pair&依赖 +individual task&input record&依赖 +individual task&intermediate record&依赖 +given input pair&zero or many output pair&依赖 +Hadoop MapReduce framework&one map task&依赖 +Hadoop MapReduce framework&InputSplit&依赖 +map task&task&GENERALIZATION +framework&task&依赖 +framework&map ( writablecomparable , writable , outputcollector , reporter )&依赖 +framework&key/value pair&依赖 +application&required cleanup&依赖 +application&Closeable.close ( ) method&依赖 +Output pair&call&依赖 +Output pair&outputcollector.collect ( writablecomparable , writable )&依赖 +application&Reporter&依赖 +intermediate value&framework&依赖 +user&grouping&依赖 +number&job&依赖 +number&reduce task&依赖 +number&of&AGGREGATION +total number&partition&AGGREGATION +intermediate output&datum&依赖 +amount&datum&AGGREGATION +user&combiner&依赖 +user&jobconf.setcombinerclass ( class )&依赖 +local aggregation&intermediate output&AGGREGATION +intermediate output&amount&依赖 +intermediate , sorted output&value ) format a simple ( key-len&依赖 +intermediate , sorted output&a simple ( key-len&依赖 +intermediate , sorted output&value ) format&实现 +intermediate , sorted output&a simple ( key-len&实现 +CompressionCodec&JobConf&依赖 +block&input file&AGGREGATION +total number&block&AGGREGATION +number&input&依赖 +number&total size&依赖 +total size&input&AGGREGATION +right level¶llelism&AGGREGATION +map&execute&依赖 +map&minute&依赖 +10TB&input datum&AGGREGATION +you&input datum&依赖 +blocksize&128MB&AGGREGATION +you&10TB&依赖 +Reducer Reducer&set&依赖 +smaller set&value&AGGREGATION +intermediate value&key&依赖 +Reducer Reducer&intermediate value&依赖 +intermediate value&value&依赖 +intermediate value&smaller set&依赖 +set&intermediate value&AGGREGATION +Reducer Reducer&Reducer&GENERALIZATION +number&reduce&AGGREGATION +number&user&依赖 +number&jobconf.setnumreducetasks ( int )&依赖 +( list&value&AGGREGATION +framework&( writablecomparable , iterator , outputcollector , reporter ) method&依赖 +Reducer&3 primary phase&依赖 +Reducer&shuffle&依赖 +Shuffle Input&mapper&依赖 +sorted output&mapper&AGGREGATION +Shuffle Input&mapper&依赖 +framework&HTTP&依赖 +framework&relevant partition&依赖 +framework&HTTP&依赖 +output&mapper&AGGREGATION +framework&output&依赖 +relevant partition&output&AGGREGATION +framework&relevant partition&依赖 +framework&output&依赖 +different mapper&same key )&依赖 +one&jobconf.setoutputvaluegroupingcomparator ( class )&依赖 +one&Comparator&依赖 +output&FileSystem&依赖 +output&outputcollector.collect ( writablecomparable , writable )&依赖 +output&reduce task&AGGREGATION +output&Reducer&AGGREGATION +right number&reduce&AGGREGATION +their&round& +faster node&reduce&依赖 +faster node&reduce&依赖 +faster node&first round&依赖 +better job&load balancing&AGGREGATION +faster node&first round&依赖 +first round&reduce&AGGREGATION 
+cost&failure&AGGREGATION +scaling factor&a few reduce slot&依赖 +scaling factor&speculative-task&依赖 +number&reduce-task&AGGREGATION +output&output path&依赖 +output path&path&GENERALIZATION +output&case&依赖 +output&set&依赖 +output&output path&依赖 +output&set&依赖 +output&case&依赖 +output&FileSystem&依赖 +output&FileSystem&依赖 +output&map-task&AGGREGATION +framework&map-output&依赖 +Partitioner Partitioner&Partitioner&GENERALIZATION +Partitioner Partitioner&key space&依赖 +Partitioner&key&依赖 +key&intermediate map-output&AGGREGATION +Partitioner&intermediate map-output&依赖 +partitioning&key&AGGREGATION +Partitioner&partitioning&依赖 +subset&key )&AGGREGATION +this control&task&依赖 +intermediate key (&for reduction&依赖 +Reporter Reporter&MapReduce application&依赖 +Reporter Reporter&Reporter&GENERALIZATION +Reporter Reporter&progress&依赖 +Mapper and Reducer implementation&Reporter&依赖 +application&time&依赖 +significant amount&time&AGGREGATION +application&significant amount&依赖 +framework&task&依赖 +way&configuration parameter mapred.task.timeout&依赖 +application&counter&依赖 +application&Reporter&依赖 +generalization&facility&AGGREGATION +OutputCollector OutputCollector&facility&依赖 +output&job )&AGGREGATION +library&useful mapper&AGGREGATION +MapReduce job configuration&job configuration&GENERALIZATION +Job Configuration JobConf&MapReduce job configuration&依赖 +JobConf&user&依赖 +framework&job&依赖 +other parameter&rest&依赖 +other parameter&framework and/or job configuration&依赖 +job parameter&setnumreducetasks ( and int ) )&依赖 +rest&framework and/or job configuration&AGGREGATION +JobConf&input file&依赖 +JobConf&set&依赖 +set&input file&AGGREGATION +JobConf&( jobconf , path&依赖 +job task&a speculative manner ( setmapspeculativeexecution&依赖 +percentage&tasks failure&AGGREGATION +other advanced facet&job&AGGREGATION +maximum number&attempt&AGGREGATION +user&course&依赖 +user&/&依赖 +large amount&( read-only ) datum&AGGREGATION +child process&process&GENERALIZATION +TaskTracker&Mapper / Reducer task&依赖 +TaskTracker&child process&依赖 +TaskTracker&separate jvm&依赖 +parent TaskTracker&TaskTracker&GENERALIZATION +environment&parent TaskTracker&AGGREGATION +child-task&environment&依赖 +child-task&parent TaskTracker&依赖 +user&mapr&依赖 +user&child-jvm&依赖 +user&additional option&依赖 +{ map | reduce }&{ map | reduce }&依赖 +configuration parameter¶meter&GENERALIZATION +value&taskid&AGGREGATION +taskid&MapReduce task&AGGREGATION +MapReduce task&task&GENERALIZATION +it&jconsole&依赖 +start&passwordless JVM JMX agent&AGGREGATION +It&maximum heap-size&依赖 +It&map&依赖 +maximum heap-size&map&AGGREGATION +It&child-jvm&依赖 +It&additional path&依赖 +It&java.library.path&依赖 +java.library.path&child-jvm&AGGREGATION +Memory Management Users/admins&maximum virtual memory&依赖 +maximum virtual memory&launched child-task&AGGREGATION +Memory Management Users/admins&launched child-task&依赖 +child.ulimit&kilo byte kb )&依赖 +Environment&Hadoop Daemons&AGGREGATION +part&framework&AGGREGATION +datum&disk&依赖 +datum&frequency&依赖 +concurrency&operation&AGGREGATION +tuning¶meter&AGGREGATION +default limit&Virtual Memory&AGGREGATION +user&Virtual Memory&依赖 +user&default limit&依赖 +user¶meter&依赖 +user&job&依赖 +name type description mapred.task.maxvmem&number&依赖 +task&job&AGGREGATION +name type description mapred.task.maxvmem&byte&依赖 +it&number&依赖 +it&more Virtual Memory&依赖 +mapred.task.maxpmem int&number&依赖 +mapred.task.maxpmem int&byte&依赖 +over-scheduling&task&AGGREGATION +a buffer and metada&accounting buffer&依赖 +Map parameter&a buffer and metada&依赖 +contents&buffer&AGGREGATION +contents&disk&依赖 
+contents&background&依赖 +map&output record&依赖 +serialization buffer&threshold&依赖 +on-disk segment&single file&依赖 +record&disk&依赖 +larger buffer&memory&依赖 +number&spill&AGGREGATION +name type description io.sort.mb int&serialization and accounting buffer&依赖 +name type description io.sort.mb int&cumulative size&依赖 +cumulative size&serialization and accounting buffer&AGGREGATION +ratio&serialization&AGGREGATION +serialized record&information&依赖 +serialized record&information&依赖 +serialized record&16 byte&依赖 +serialized record&16 byte&依赖 +serialized record&information&依赖 +16 byte&information&AGGREGATION +serialized record&16 byte&依赖 +serialized record&16 byte&依赖 +its&size& +serialized record&information&依赖 +percentage&probability&依赖 +probability&spill&AGGREGATION +exhaustion&serialization buffer&AGGREGATION +percentage&spill&依赖 +percentage&disk being&依赖 +percentage&space&AGGREGATION +higher value&disk&依赖 +higher value&number&依赖 +higher value&number&依赖 +higher value&spill&依赖 +higher value&spill&依赖 +higher value&spill&依赖 +higher value&disk&依赖 +higher value&disk&依赖 +higher value&number&依赖 +higher value&spill&依赖 +higher value&number&依赖 +higher value&disk&依赖 +contents&disk&依赖 +percentage&buffer&AGGREGATION +their&contents& +contents&background&依赖 +*&* 2 ^ 16&依赖 +maximum number&record&AGGREGATION +higher value&merge&依赖 +higher value&number&依赖 +higher value&eliminate&依赖 +number&merge&AGGREGATION +probability&map task&AGGREGATION +size&map output&AGGREGATION +map output&output&GENERALIZATION +0.66&buffer&AGGREGATION +io.sort.buffer.spill.percent&0.33&依赖 +next spill&all collect record&依赖 +remainder&buffer&AGGREGATION +threshold&other word&依赖 +threshold&trigger&依赖 +record&spill&依赖 +record&combiner&依赖 +Shuffle/Reduce Parameters&output&依赖 +Shuffle/Reduce Parameters&output&依赖 +output&memory&依赖 +intermediate compression&map output&AGGREGATION +option&merge&依赖 +option&merge&依赖 +option&frequency&依赖 +frequency&merge&AGGREGATION +option&frequency&依赖 +option&frequency&依赖 +option&merge&依赖 +number&segment&AGGREGATION +number&same time&依赖 +It&number&依赖 +It&open file and compression codec&依赖 +It&open file and compression codec&依赖 +number&open file and compression codec&AGGREGATION +It&number&依赖 +merge&several pass&依赖 +number&file&AGGREGATION +number&limit&依赖 +limit&map&依赖 +number&sorted map output&AGGREGATION +unit&partition&AGGREGATION +threshold&only frequency&依赖 +only frequency&in-memory merge&AGGREGATION +threshold&in-memory merge&依赖 +threshold&only frequency&依赖 +threshold&in-memory merge&依赖 +memory threshold&threshold&GENERALIZATION +mapred.job.shuffle.merge.percent float&memory threshold&依赖 +mapred.job.shuffle.merge.percent float&fetched map output&依赖 +mapred.job.shuffle.merge.percent float&fetched map output&依赖 +percentage&memory&AGGREGATION +mapred.job.shuffle.merge.percent float&memory threshold&依赖 +mapred.job.shuffle.merge.percent float&fetched map output&依赖 +mapred.job.shuffle.merge.percent float&memory threshold&依赖 +whose&input& +parameter&only frequency&依赖 +parameter&only frequency&依赖 +parameter&in-memory merge&依赖 +parameter&in-memory merge&依赖 +mapred.job.shuffle.input.buffer.percent float&percentage&依赖 +mapred.job.shuffle.input.buffer.percent float&memory&依赖 +it&large and numerous map output&依赖 +memory&framework&依赖 +mapred.job.reduce.input.buffer.percent float&percentage&依赖 +mapred.job.reduce.input.buffer.percent float&memory relative&依赖 +percentage&memory relative&AGGREGATION +map output&disk&依赖 +map output&default&依赖 +larger than 25 percent&memory&AGGREGATION +it&disk&依赖 +combiner&merge&依赖 
+one&time&依赖 +part&intermediate merge&AGGREGATION +in-memory map output&intermediate merge&依赖 +Directory Structure&localized cache&依赖 +Directory Structure&local directory , $ { mapred.local.dir } / taskTracker /&依赖 +Directory Structure&localized cache&依赖 +Directory Structure&local directory , $ { mapred.local.dir } / taskTracker /&依赖 +It&multiple local directory&依赖 +filename&semi-random local directory&依赖 +task tracker&localized job directory&依赖 +job&user&AGGREGATION +directory&localized public distributed cache&依赖 +localized public distributed cache&user&依赖 +tasks and job&user&AGGREGATION +localized public distributed cache&tasks and job&依赖 +job&specific user&AGGREGATION +directory&localized private distributed cache&依赖 +tasks and job&specific user&AGGREGATION +localized private distributed cache&tasks and job&依赖 +localized private distributed cache&specific user&依赖 +job&other user&AGGREGATION +It&job&依赖 +It&other user&依赖 +task&space&依赖 +task&them&依赖 +task&scratch space and share file&依赖 +directory&configuration property job.local.dir&依赖 +directory&user&依赖 +directory&API JobConf.getJobLocalDir ( )&依赖 +System property&property&GENERALIZATION +It&System property&依赖 +user&directory&依赖 +user&system.getproperty&依赖 +user&directory&依赖 +user&system.getproperty&依赖 +jars directory&job jar file&依赖 +jars directory&directory&GENERALIZATION +application&file& +It&task&依赖 +It&job start&依赖 +It&jars directory&依赖 +job.jar location&api JobConf.getJar ( )&依赖 +job.jar location&location&GENERALIZATION +job.jar location&application&依赖 +task directory&directory&GENERALIZATION +task directory&structure&依赖 +current working directory&$ taskid/work&依赖 +current working directory&etc .&依赖 +jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 +jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 +jvm&temporary directory&依赖 +directory&jvm reuse&依赖 +jvm&temporary directory&依赖 +jvm&temporary directory&依赖 +jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 +jvm&temporary directory&依赖 +jvm&temporary directory&依赖 +jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 +jvm&$ { mapred.local.dir } / taskTracker / $ user/jobcache / $ jobid /&依赖 +value&temporary directory&AGGREGATION +( user&property mapred.child.tmp&依赖 +property mapred.child.tmp&mapred.child.tmp&GENERALIZATION +( user&value&依赖 +( user&temporary directory&依赖 +( user&map&依赖 +task&directory& +it&work directory&依赖 +Djava.io.tmpdir&tmp dir '&依赖 +absolute path&tmp dir '&AGGREGATION +Djava.io.tmpdir&absolute path&依赖 +child java task&option&依赖 +TMPDIR = '&tmp dir ' )&AGGREGATION +Pipes and streaming&environment variable&依赖 +Pipes and streaming&tmp dir ' )&依赖 +Pipes and streaming&TMPDIR = '&依赖 +mapred.child.tmp&value&依赖 +/ tmp Task JVM Reuse Jobs&task jvm&依赖 +number&same job )&AGGREGATION +task&same job )&AGGREGATION +number&task&AGGREGATION +One&value greater than 1 and ( int&依赖 +task&execution& +id&task&AGGREGATION +start&map input split&AGGREGATION +number&temporary output directory note&依赖 +execution&streaming job&AGGREGATION +mapred.jar string job.jar location&string task&依赖 +mapred.jar string job.jar location&string task&依赖 +name&" mapred " parameter&AGGREGATION +mapred.jar string job.jar location&string task&依赖 +name&streaming job&依赖 +number&byte&AGGREGATION +streaming job&job&GENERALIZATION +name&execution&依赖 +job&mapper/reducer& +task logs standard output ( stdout ) and error ( stderr&task&AGGREGATION +native library&task&依赖 +native library&task&依赖 +native library&task&依赖 
+child-jvm&java.library.path and LD_LIBRARY_PATH&依赖 +child-jvm¤t working directory&依赖 +its&directory& +cached library&System.loadLibrary or System.load&依赖 +More detail&native_libraries.html&依赖 +user-job&primary interface&依赖 +user-job&JobTracker&依赖 +component-tasks&reports& +cluster&information& +their&progress& +input and output specification&job&AGGREGATION +DistributedCache&job&AGGREGATION +job&jar& +Job history file&user specified directory hadoop.job.history.user.location&依赖 +file&specified directory&依赖 +file&" _ logs/history /&依赖 +they&default&依赖 +command&job detail&依赖 +User&history logs summary&依赖 +User&command $ bin/hadoop job&依赖 +history&output directory listing&依赖 +history&filter log file&依赖 +history&output directory listing&依赖 +history&OutputLogFilter&依赖 +history&filter log file&依赖 +history&OutputLogFilter&依赖 +user&application&依赖 +Job Authorization Job level authorization and queue level authorization&cluster&依赖 +user&job detail&依赖 +access control check&JobTracker&依赖 +access control check&user&依赖 +job submitter&job&依赖 +job submitter&access control list&依赖 +job submitter&configuration property&依赖 +owner mapred.queue.queue-name .&access&依赖 +queue administrator&queue&AGGREGATION +owner mapred.queue.queue-name .&access&依赖 +job&owner& +job&queue&依赖 +job view ACL&user&依赖 +job view ACL&configured mapreduce.job.acl-view-job&依赖 +job level counter task level counter task&web ui other information&依赖 +its&profile& +job level counter task level counter task&user&依赖 +JobTracker&information& +job level counter task level counter task&web ui other information&依赖 +its&status& +job level counter task level counter task&job&依赖 +job level counter task level counter task&status&依赖 +job level counter task level counter task&status&依赖 +job level counter task level counter task&user&依赖 +tasks&task& +job level counter task level counter task&job&依赖 +job modification ACL&user&依赖 +job modification ACL&configured mapreduce.job.acl-modify-job&依赖 +priority&job&AGGREGATION +operation&queue level acl&依赖 +operation&queue level acl&依赖 +caller&operation&依赖 +he/she&queue admin acl or job modification acl&依赖 +part&queue admin acl or job modification acl&AGGREGATION +format&job level ACL&AGGREGATION +Job Control Users&complex task&依赖 +output&turn&依赖 +output&distributed file-system&依赖 +output&next job&依赖 +output&distributed file-system&依赖 +output&input&依赖 +various job-control option be&such case&依赖 +various job-control option be&such case&依赖 +submitjob ( jobconf )&job&依赖 +jobconf.setjobendnotificationuri ( string )&polling&依赖 +Kerberos&command& +user&secure cluster&依赖 +user&Job Credentials&依赖 +user&' kinit command&依赖 +Job Credentials&Credentials&GENERALIZATION +we&scalability concern&依赖 +we&' ticket&依赖 +we&MapReduce job&依赖 +Kerberos&tickets& +client&Kerberos& +we&delegation token&依赖 +we&them&依赖 +part&job submission&AGGREGATION +delegation token&hdf&依赖 +HDFS system&FileInputFormats&依赖 +hdf&staging directory&依赖 +Other application&configuration " mapreduce.job.hdfs-servers "&依赖 +comma separated list&file system name&AGGREGATION +token&part&依赖 +token&JobTracker&依赖 +token&Credentials&依赖 +token&job submission&依赖 +we&MapReduce delegation token&依赖 +task&job&依赖 +task&JobTracker&依赖 +task&MapReduce delegation token&依赖 +delegation token&JobClient.getDelegationToken&依赖 +delegation token&token&GENERALIZATION +delegation token&API&依赖 +obtained token&credentials&依赖 +credentials&part&依赖 +credentials&JobTracker&依赖 +part&job submission process&AGGREGATION +credentials&job submission process&依赖 +JobTracker&its filesystem (&依赖 
+JobTracker&its filesystem (&依赖 +JobTracker&hdf&依赖 +JobTracker&hdf&依赖 +JobTracker&tokens and secret&依赖 +JobTracker&tokens and secret&依赖 +JobTracker&tokens and secret&依赖 +its&filesystem& +JobTracker&its filesystem (&依赖 +JobTracker&hdf&依赖 +TaskTracker&part job localization&依赖 +TaskTracker&file&依赖 +task&environment variable&依赖 +task&HADOOP_TOKEN_FILE_LOCATION&依赖 +task&configuration " mapreduce.job.credentials.binary "&依赖 +HDFS delegation token&JobTracker&依赖 +task&job&依赖 +task&job&依赖 +whose&tasks& +job&same token&依赖 +arbitrary secret&task&依赖 +arbitrary secret&access other third party service&依赖 +arbitrary secret&HDFS delegation token&依赖 +arbitrary secret&job submission&依赖 +Mapper/Reducer class&JobConfigurable&实现 +similar thing&new MapReduce API&依赖 +similar thing&Mapper.setup method&依赖 +task&api&依赖 +task&api&依赖 +task&secret&依赖 +task&secret&依赖 +Job Input InputFormat&input-specification&依赖 +Job Input InputFormat&MapReduce job&依赖 +InputFormat&job to&AGGREGATION +input-specification&job&AGGREGATION +input file&file&GENERALIZATION +sub-class&FileInputFormat&AGGREGATION +default behavior&input&依赖 +default behavior&file-based InputFormat implementation&AGGREGATION +total size&input file&AGGREGATION +byte&input file&AGGREGATION +FileSystem blocksize&input file&AGGREGATION +logical split&many application&依赖 +logical split&many application&依赖 +application&RecordReader&实现 +application&such case&实现 +record-oriented view&logical InputSplit&AGGREGATION +TextInputFormat&given job&依赖 +framework&input-file&依赖 +framework&input-file with ###&依赖 +them&appropriate CompressionCodec&依赖 +its&entirety& +compressed file&single mapper&依赖 +compressed file&entirety&依赖 +InputSplit InputSplit&InputSplit&GENERALIZATION +InputSplit InputSplit&datum&依赖 +it&RecordReader&依赖 +InputSplit&byte-oriented view&依赖 +byte-oriented view&input&AGGREGATION +responsibility&RecordReader&AGGREGATION +InputSplit&input&依赖 +It&path&依赖 +It&logical split&依赖 +It&map.input.file&依赖 +path&input file&AGGREGATION +RecordReader RecordReader&InputSplit&依赖 +RecordReader RecordReader&< key , value > pair&依赖 +RecordReader RecordReader&RecordReader&GENERALIZATION +RecordReader&byte-oriented view&依赖 +RecordReader&input&依赖 +RecordReader&responsibility&依赖 +RecordReader&processing record boundary&依赖 +Job Output OutputFormat&MapReduce job&依赖 +Job Output OutputFormat&output-specification&依赖 +OutputFormat&job to&AGGREGATION +output-specification&job&AGGREGATION +output file&job&AGGREGATION +Output file&FileSystem&依赖 +commit&task output&AGGREGATION +OutputCommitter OutputCommitter&OutputCommitter&GENERALIZATION +OutputCommitter OutputCommitter&MapReduce job&依赖 +OutputCommitter OutputCommitter&commit&依赖 +OutputCommitter&job to&AGGREGATION +MapReduce framework&OutputCommitter&依赖 +MapReduce framework&job to&依赖 +initialization&job&AGGREGATION +Job setup&separate task&依赖 +job&state&依赖 +Job cleanup&separate task&依赖 +Job cleanup&end&依赖 +Job cleanup&job&依赖 +end&job&AGGREGATION +Task setup&same task&依赖 +part&same task&AGGREGATION +Task setup&part&依赖 +Task setup&task initialization&依赖 +task&exception block )&依赖 +Job setup/cleanup task&slot&依赖 +Job setup/cleanup task&map&依赖 +JobCleanup task&task&GENERALIZATION +JobCleanup task&highest priority&依赖 +two instance&same Mapper or Reducer&AGGREGATION +application-writer&using&依赖 +application-writer&unique name&依赖 +output&task-attempt&AGGREGATION +MapReduce framework&_ $ { taskid&依赖 +MapReduce framework&FileSystem&依赖 +successful completion&task-attempt&AGGREGATION +file&task-attempt&依赖 +file&successful completion&依赖 +file&$ { 
mapred.output.dir }&依赖 +sub-directory&unsuccessful task-attempt&AGGREGATION +framework&sub-directory&依赖 +framework&sub-directory&依赖 +framework&unsuccessful task-attempt&依赖 +framework&unsuccessful task-attempt&依赖 +process&application&依赖 +execution&task&AGGREGATION +$ { mapred.work.output.dir }&task&AGGREGATION +advantage&feature&AGGREGATION +framework&succesful task-attempt&依赖 +application-writer&advantage&依赖 +framework&them&依赖 +application-writer&feature&依赖 +value&MapReduce framework&依赖 +value&$ { mapred.work.output.dir }&AGGREGATION +execution&particular task-attempt&AGGREGATION +output&hdf&依赖 +map&job&AGGREGATION +output&case&依赖 +output&hdf&依赖 +output&case&依赖 +RecordWriter RecordWriter&output < key , value > pair&依赖 +RecordWriter RecordWriter&output file&依赖 +output file&file&GENERALIZATION +RecordWriter implementation&FileSystem&依赖 +RecordWriter implementation&job output&依赖 +Other Useful Features Submitting Jobs&queue&依赖 +Other Useful Features Submitting Jobs&job&依赖 +collection&job&AGGREGATION +queue&acl&依赖 +queue&example&依赖 +who&job&依赖 +who&them&依赖 +Hadoop&single mandatory queue&依赖 +mapred.queue.names property&Hadoop site configuration&AGGREGATION +Queue name&Hadoop site configuration&依赖 +Queue name&mapred.queue.names property&依赖 +job scheduler&support multiple queue&依赖 +job scheduler&support multiple queue&依赖 +job&queue&依赖 +it&' default ' queue&依赖 +job&associated queue name&依赖 +'&queue& +Counters counter&global counter&依赖 +group&type Counters.Group&AGGREGATION +counter&type Counters.Group&依赖 +counter&particular Enum&AGGREGATION +counter&group&依赖 +application&type enum )&依赖 +application&arbitrary counter (&依赖 +arbitrary counter (&type enum )&AGGREGATION +DistributedCache DistributedCache&application-specific , large , read-only file&依赖 +DistributedCache DistributedCache&DistributedCache&GENERALIZATION +application&file&依赖 +/&/&GENERALIZATION +job&node&依赖 +framework&task&依赖 +framework&necessary file&依赖 +framework&necessary file&依赖 +slave node&node&GENERALIZATION +efficiency&fact&依赖 +Its&efficiency& +DistributedCache&cached file&依赖 +DistributedCache&modification timestamp&依赖 +modification timestamp&cached file&AGGREGATION +cache file&application or externally&依赖 +archive&slave node&依赖 +{ file&| archives }&依赖 +they&comma separated path&依赖 +property&distributedcache.addcachearchive ( uri , conf )&依赖 +property&api distributedcache.addcachefile ( uri , conf )&依赖 +user&DistributedCache&依赖 +name&symlink&AGGREGATION +DistributedCache&URI&依赖 +DistributedCache&URI&依赖 +DistributedCache&fragment&依赖 +DistributedCache&URI&依赖 +fragment&URI&AGGREGATION +DistributedCache&fragment&依赖 +DistributedCache&fragment&依赖 +uri hdf&symlink name&依赖 +uri hdf&lib.so&依赖 +uri hdf&symlink name&依赖 +uri hdf&lib.so&依赖 +symlink name&name&GENERALIZATION +uri hdf&symlink name&依赖 +uri hdf&lib.so&依赖 +uri hdf&lib.so&依赖 +uri hdf&lib.so&依赖 +uri hdf&symlink name&依赖 +uri hdf&symlink name&依赖 +uri hdf&lib.so&依赖 +uri hdf&lib.so&依赖 +uri hdf&lib.so&依赖 +uri hdf&lib.so&依赖 +uri hdf&symlink name&依赖 +uri hdf&lib.so&依赖 +uri hdf&symlink name&依赖 +uri hdf&symlink name&依赖 +uri hdf&lib.so&依赖 +uri hdf&symlink name&依赖 +uri hdf&symlink name&依赖 +uri hdf&symlink name&依赖 +uri hdf&symlink name&依赖 +uri hdf&symlink name&依赖 +uri hdf&symlink name&依赖 +task&cwd& +uri hdf&lib.so&依赖 +uri hdf&symlink name&依赖 +uri hdf&lib.so&依赖 +uri hdf&lib.so&依赖 +uri hdf&lib.so&依赖 +DistributedCache&reduce&依赖 +DistributedCache&use&依赖 +DistributedCache&rudimentary software distribution mechanism&依赖 +classpath&child-jvm&AGGREGATION +distributedcache.addarchivetoclasspath ( path&cache 
files/jars&依赖 +directory&task&AGGREGATION +they&slave node&依赖 +whose job&file&依赖 +whose&jobs& +Private " DistributedCache file&local directory&依赖 +file&specific user&依赖 +file&tasks and job&依赖 +its&permissions& +virtue&permission&AGGREGATION +directory path&lookup&依赖 +file&world readable access&依赖 +directory path&lookup&依赖 +directory path&world executable access&依赖 +directory path&world executable access&依赖 +directory path&path&GENERALIZATION +" Public " DistributedCache file&global directory&依赖 +file&tasks and job&依赖 +file&user&依赖 +file&slave&依赖 +file&user&依赖 +Tool The Tool interface&handling&依赖 +handling&generic Hadoop command-line option&AGGREGATION +Tool The Tool interface&generic Hadoop command-line option&依赖 +Tool&MapReduce tool or application&依赖 +application&standard command-line option&依赖 +its&arguments& +handling&standard command-line option&AGGREGATION +application&handling&依赖 +TaskTracker&directory& +$ cd / taskTracker /&$ bin/hadoop org.apache.hadoop.mapred.IsolationRunner&依赖 +failed task&node&依赖 +IsolationRunner&same input&依赖 +IsolationRunner&single jvm&依赖 +IsolationRunner&failed task&依赖 +IsolationRunner&map task&依赖 +3 ) sample&built-in java profiler&AGGREGATION +sample&map&AGGREGATION +User&profiler information&依赖 +profiler information&user log directory&依赖 +profiling&default&依赖 +profiling&job&依赖 +she/he&configuration property mapred.task.profile&依赖 +configuration property&property&GENERALIZATION +{ maps |&MapReduce task&依赖 +{ maps |&range&依赖 +{ maps |&reduce }&依赖 +range&MapReduce task&AGGREGATION +specified range&default&依赖 +User&profiler configuration argument&依赖 +string&a %&依赖 +it&name&依赖 +it&profiling output file&依赖 +name&profiling output file&AGGREGATION +parameter&command line&依赖 +parameter&task child JVM&依赖 +file = %&file = %&依赖 +user&debug script&依赖 +task&stdout& +output&console diagnostic&依赖 +script&stdout& +part&job uus&AGGREGATION +we&debug script&依赖 +user&DistributedCache&依赖 +a quick way&value&依赖 +a quick way&property&依赖 +a quick way&map&依赖 +debug script&command-line option&依赖 +debug script&streaming mode&依赖 +streaming mode&mode&GENERALIZATION +task&files& +Pipes program&c++ program name&依赖 +Pipes program&command&依赖 +Pipes program&fifth argument&依赖 +their&dependencies& +set&MapReduce job&AGGREGATION +utility&MapReduce job&依赖 +utility&set&依赖 +job-outputs i.e. output&reduce&AGGREGATION +both performance ( zlib ) and non-availability&Java library&AGGREGATION +native implementation&above compression codec&AGGREGATION +reason&both performance ( zlib ) and non-availability&AGGREGATION +their&usage& +compression&intermediate map-output&AGGREGATION +Intermediate Outputs application&compression&依赖 +Intermediate Outputs application&intermediate map-output&依赖 +Intermediate Outputs application&intermediate map-output&依赖 +Intermediate Outputs application&compression&依赖 +job outputs application&compression&依赖 +job outputs application&fileoutputformat.setcompressoutput ( jobconf , boolean ) apus&依赖 +compression&job-output&AGGREGATION +job outputs application&fileoutputformat.setoutputcompressorclass ( jobconf&依赖 +job outputs application&class ) apus and fileoutputformat.setoutputcompressorclass ( jobconf&依赖 +require sequencefile.compressiontype ( i.e. 
record&sequencefileoutputformat.setoutputcompressiontype ( jobconf , sequencefile.compressiontype ) apus&依赖 +certain set&bad input record&AGGREGATION +application&feature&依赖 +application&SkipBadRecords class&依赖 +map task crash&certain input&依赖 +map task crash&certain input&依赖 +user&bug&依赖 +source code&which&依赖 +bug&example&依赖 +task&such case&依赖 +task&multiple attempt&依赖 +small portion&datum&AGGREGATION +feature&default&依赖 +'&certain number&依赖 +'&map failure&依赖 +'&mode&依赖 +certain number&map failure&AGGREGATION +map task&record&依赖 +' skip mode '&' skip mode '&依赖 +map task&range&依赖 +range&record&AGGREGATION +framework&processed record counter&依赖 +skipbadrecords.counter _ map_processed_records and skipbadrecords.counter _ reduce_processed_groups&skipbadrecords.counter _ map_processed_records and skipbadrecords.counter _ reduce_processed_groups&依赖 +counter&framework&依赖 +what record range&task&依赖 +what record range&what record range&依赖 +range&further attempt&依赖 +number&record&AGGREGATION +processed record counter&application&依赖 +application&processing&依赖 +their&processing& +framework&additional record&依赖 +framework&bad record&依赖 +framework&bad record&依赖 +framework&additional record&依赖 +user&skipped record&依赖 +number&skipped record&AGGREGATION +user&number&依赖 +user&skipped record&依赖 +user&skipped record&依赖 +user&number&依赖 +user&number&依赖 +framework&range&依赖 +range&skipped record&AGGREGATION +framework&binary search-like approach&依赖 +framework&skipped record&依赖 +skipped range&two half&依赖 +framework&bad record&依赖 +number&task attempt&AGGREGATION +Skipped record&sequence file format&依赖 +Skipped record&hdf&依赖 +Skipped record&later analysis&依赖 +location&skipbadrecords.setskipoutputpath ( jobconf , path )&依赖 +many&feature&AGGREGATION +more complete WordCount&feature&依赖 +more complete WordCount&many&依赖 +it&pseudo-distributed or fully-distributed Hadoop installation&依赖 +public class WordCount&Tool { 14&实现 +private boolean casesensitive = true ; 23&private boolean casesensitive = true ; 23&依赖 +private long numrecord = 0&26&依赖 +private long numrecord = 0&private long numrecord = 0&依赖 +public void configure ( jobconf job&) { 29&依赖 +JobConf&job&GENERALIZATION +( job.getboolean&false ) ) { 33&依赖 +( job.getboolean&false ) ) { 33&依赖 +private void parseskipfile ( path patternsfile&) { 46&依赖 +file&+& +public void map&) throw ioexception { 58&依赖 +public void map&) throw ioexception { 58&依赖 +public void map&) throw ioexception { 58&依赖 +public void map&) throw ioexception { 58&依赖 +while ( tokenizer.hasmoretokens&) ) { 66&依赖 +100 ) == 0 ) { 72&( ( + + numrecord&依赖 +100 ) == 0 ) { 72&100 ) == 0 ) { 72&依赖 +100 ) == 0 ) { 72&( ( + + numrecord&依赖 +100 ) == 0 ) { 72&100 ) == 0 ) { 72&依赖 +100 ) == 0 ) { 72&100 ) == 0 ) { 72&依赖 +100 ) == 0 ) { 72&( ( + + numrecord&依赖 +intwritable > { 78&intwritable > { 78&依赖 +public void reduce&) throw ioexception { 79&依赖 +public void reduce&) throw ioexception { 79&依赖 +public void reduce&) throw ioexception { 79&依赖 +public void reduce&) throw ioexception { 79&依赖 +while ( values.hasnext ( ) ) { 81&while ( values.hasnext ( ) ) { 81&依赖 +public int&Exception { 88&依赖 +public int&Exception { 88&依赖 +public int&Exception { 88&依赖 +} else { 107&} else { 107&依赖 +public static void main ( string [ ] arg&Exception { 119&依赖 +they&output&依赖 +plug-in a pattern-file&word-pattern&依赖 +let&DistributedCache&依赖 +let&DistributedCache&依赖 +let&plug-in a pattern-file&依赖 +let&plug-in a pattern-file&依赖 +second version&previous one&依赖 +2 highlight&usr/joe/wordcount / output/part -00000 bye 1 goodbye 1 hadoop 2 hello 
2 world&依赖 +second version&WordCount&AGGREGATION +second version&previous one&依赖 +application&configuration parameter&依赖 +configure method&mapper ( and reducer ) implementation ( line&AGGREGATION +it&word-pattern&依赖 +it&user&依赖 +it&skip&依赖 +utility&Tool interface&AGGREGATION +application&counters ( line 68 )&依赖 +they&application-specific status information&依赖 +they&Reporter instance&依赖 +registered trademark&Sun Microsystems , Inc.&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Working and Components-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Working and Components-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a30a228867617e34829ac71e2a021f989105bf7 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Working and Components-relation.txt @@ -0,0 +1,97 @@ +MapReduce&developer&依赖 +huge amount&unstructured datum&AGGREGATION +MapReduce&unstructured datum&依赖 +MapReduce&NCache cluster&依赖 +MapReduce¶llel&依赖 +MapReduce&huge amount&依赖 +MapReduce&developer&依赖 +MapReduce¶llel&依赖 +NCache cluster&cluster&GENERALIZATION +MapReduce&unstructured datum&依赖 +MapReduce&NCache cluster&依赖 +MapReduce&huge amount&依赖 +MapReduce&size&依赖 +cluster&size&AGGREGATION +MapReduce&node&依赖 +MapReduce&cluster&依赖 +MapReduce¶llel&依赖 +term “ MapReduce ”&two distinct phase&依赖 +‘ Map ’ phase&set&依赖 +set&datum&AGGREGATION +‘ Map ’ phase&datum&依赖 +‘ Reduce ’ phase&output&依赖 +‘ Reduce ’ phase&‘ Map ’&依赖 +‘ Reduce ’ phase&‘ Map ’&依赖 +‘ Reduce ’ phase&output&依赖 +user&key-value pair&依赖 +user&set&依赖 +user&intermediate key-value pair&依赖 +set&intermediate key-value pair&AGGREGATION +user&key-value pair&依赖 +user&intermediate key-value pair&依赖 +user&set&依赖 +Reducer&intermediate key-value pair&依赖 +Reducer&having&依赖 +Reducer&having&依赖 +Reducer&intermediate key-value pair&依赖 +example&combiner )&依赖 +cluster&three node&AGGREGATION +example&a mapreduce task (&依赖 +task&order&依赖 +task&product&依赖 +task&Mapper and extracts count&依赖 +task&order&依赖 +Mapper and extracts count&product&AGGREGATION +Reducer&node& +aggregated count&final aggregation&依赖 +count&figure 2&依赖 +aggregated count&Reducer node&依赖 +Mapper&output& +Reducer node&node&GENERALIZATION +two (&Combining&依赖 +two (&Combining&依赖 +aggregation and compilation&final result&AGGREGATION +care&aggregation and compilation&AGGREGATION +Combine phase&performance&依赖 +it&network traffic&依赖 +it&Mapper and Reducers&依赖 +NCache MapReduce&MapReduce&GENERALIZATION +NCache MapReduce&three phase&依赖 +NCache MapReduce&Map&依赖 +its&reducer& +NCache MapReduce&default reducer&依赖 +user&Reducer&实现 +Default reducer&output&依赖 +Default reducer&output&依赖 +mapper , combiner and reducer&NCache MapReduce task&依赖 +mapper , combiner and reducer&NCache cluster&依赖 +Mapper output&Combiner&依赖 +Mapper output&output&GENERALIZATION +it&Reducer&依赖 +Combiner&output& +Reducer&output&依赖 +’s output&specified chunk size&依赖 +Mapper&chunk& +Number&task&AGGREGATION +combiner or reducer once output chunk&configured chunk size&依赖 +typical MapReduce task&components :&依赖 +Mapper&initial input&依赖 +Combiner Factory&combiner&依赖 +Combiner Factory&combiner&依赖 +Combiner Factory&combiner&依赖 +its&keys& +Key Filter&filter cache datum&依赖 +Key Filter&user&依赖 +KeyFilter&Mapper phase&依赖 +Map&key&依赖 +it&false&依赖 +Mapper&key&依赖 +its&status& +component&track&依赖 +component&progress&依赖 +component&task&依赖 +progress&task&AGGREGATION +track&progress&AGGREGATION +output&task&AGGREGATION +you&output&依赖 +you&task&依赖 diff --git 
"a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components-relation.txt" "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components-relation.txt" new file mode 100644 index 0000000000000000000000000000000000000000..dffaa920fc61591c098ac7314827be8a24c8cb42 --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components-relation.txt" @@ -0,0 +1,22 @@ +Split&logical representation&依赖 +logical representation&block&AGGREGATION +Split&block&依赖 +map-reduce 1 mapper&1 split&依赖 +map-reduce 1 mapper&map-reduce 1 mapper&依赖 +map-reduce 1 mapper&time&依赖 +10 mapper&input file&依赖 +block size&10 split&依赖 +We&client , master and slave&依赖 +Client&job&依赖 +We&job&依赖 +we&mapper and reducer&依赖 +we&program&依赖 +We&program&依赖 +We&job&依赖 +We&sub-division&依赖 +sub-division&job&AGGREGATION +job&smaller task&依赖 +Master&multiple task&依赖 +Master&work or job&依赖 +actual work&slave&依赖 +Master&job&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b4a842419b3cfa5cf143310475c8b7102613bb7 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce-relation.txt @@ -0,0 +1,118 @@ +core component&Apache Hadoop software framework&AGGREGATION +MapReduce Stephen J. Bigelow&Apache Hadoop software framework&依赖 +MapReduce Stephen J. Bigelow&Apache Hadoop software framework&依赖 +MapReduce Stephen J. Bigelow&Apache Hadoop software framework&依赖 +distributed processing&commodity computer cluster&依赖 +distributed processing&commodity computer cluster&依赖 +node&cluster&AGGREGATION +node&own storage&依赖 +its&storage& +distributed processing&massive unstructured datum set&AGGREGATION +distributed processing&commodity computer cluster&依赖 +it&result&依赖 +MapReduce&two essential function&依赖 +it&result&依赖 +it&node&依赖 +it&node&依赖 +it&node&依赖 +it&node&依赖 +it&result&依赖 +it&result&依赖 +MapReduce&original version&依赖 +original version&MapReduce&AGGREGATION +MapReduce&MapReduce&依赖 +master node&jobs and resource&依赖 +master node&cluster&依赖 +master node&node&GENERALIZATION +component&completed job&依赖 +previous JobTracker and TaskTracker daemon&introduction&依赖 +previous JobTracker and TaskTracker daemon&component&依赖 +introduction&mapreduce and hadoop version&AGGREGATION +previous JobTracker and TaskTracker daemon&another resource negotiator ( yarn )&依赖 +component&another resource negotiator ( yarn )&AGGREGATION +previous JobTracker and TaskTracker daemon&mapreduce and hadoop version&依赖 +ResourceManager&master node&依赖 +submission and scheduling&job&AGGREGATION +It&job&依赖 +NodeManager&slave node&依赖 +NodeManager&other daemon&依赖 +slave node&node&GENERALIZATION +MapReduce&massive cluster size&依赖 +MapReduce¶llel&依赖 +number&server&AGGREGATION +job&number&依赖 +cluster size&final result&依赖 +job&server&依赖 +job&results& +MapReduce&software development&实现 +MapReduce&C , C++ , Java , Ruby , Perl and Python&依赖 +MapReduce&several language&依赖 +programmer&MapReduce library&依赖 +node&status&依赖 +node&master node&依赖 +its&status& +master node&piece&依赖 +master node&cluster&依赖 +master node&job&依赖 +piece&job&AGGREGATION +master node&other available node&依赖 +its&ability& +result&node&AGGREGATION +power&MapReduce&AGGREGATION +user&time&依赖 +user&number&依赖 +user&time&依赖 +number&time&AGGREGATION +user&number&依赖 +user&26 people&依赖 +separate sheet&paper&AGGREGATION +user&task&依赖 +user&contrast&依赖 +map 
aspect&MapReduce&AGGREGATION +her&place& +MapReduce&element& +user&26 box&依赖 +user&single-word page&依赖 +26 box&first letter&依赖 +first letter&word&AGGREGATION +their&pages& +26 box&word&依赖 +user&a box and sort&依赖 +user&stack alphabetically&依赖 +number&reduce aspect&依赖 +number&MapReduce&依赖 +example&reduce aspect&AGGREGATION +number&reduce aspect&依赖 +number&reduce aspect&依赖 +number&MapReduce&依赖 +reduce aspect&MapReduce&AGGREGATION +number&page&AGGREGATION +number&MapReduce&依赖 +broad range&real-world use&AGGREGATION +social networking site&example&依赖 +social networking site&MapReduce&依赖 +users&friends& +historical behavior&user&AGGREGATION +booking website&MapReduce&依赖 +industrial facility&different sensor&依赖 +industrial facility&equipment datum&依赖 +industrial facility&installation&依赖 +Many business&capital and overhead&依赖 +Hadoop and MapReduce&enormous scalability&依赖 +organization&public cloud service&依赖 +organization&result&依赖 +Hadoop and MapReduce&minimal capital cost&依赖 +organization&Hadoop and MapReduce&依赖 +its&offering& +HDInsight service&provision Hadoop&依赖 +its&service& +Microsoft Azure&HDInsight service&依赖 +HDInsight service&user&依赖 +Hadoop and MapReduce&one option&依赖 +Hadoop and MapReduce&organization&依赖 +Spark and Hadoop cluster&private , on-premises big data infrastructure&依赖 +Organizations&Apache Spark&依赖 +Organizations&other platform&依赖 +big data framework&type&依赖 +big data framework&processing task&依赖 +type&processing task&AGGREGATION diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad7368a72f861cdf2ab583c4f42cda545e7b5d1d --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop-relation.txt @@ -0,0 +1,223 @@ +component&Apache Hadoop ecosystem&AGGREGATION +Get start&MapReduce&依赖 +Apache Hadoop ecosystem&massive data processing&依赖 +hadoop december 6 , 2020 facebooktwitterlinkedin mapreduce&framework&依赖 +Apache Hadoop ecosystem&Hadoop ecosystem&GENERALIZATION +Get start&hadoop december 6 , 2020 facebooktwitterlinkedin mapreduce&依赖 +hadoop december 6 , 2020 facebooktwitterlinkedin mapreduce&Apache Hadoop ecosystem&依赖 +Other component&Apache Hadoop&AGGREGATION +MapReduce component&dispersed and parallel algorithm&依赖 +MapReduce component&processing&依赖 +MapReduce component&Hadoop ecosystem&依赖 +MapReduce component&massive datum&依赖 +processing&massive datum&AGGREGATION +understanding&MapReduce&AGGREGATION +MapReduce&real-life application&依赖 +It&reader&依赖 +It&insight&依赖 +vast volume&datum&AGGREGATION +write application&datum&依赖 +write application&large cluster&依赖 +vast amount&datum&AGGREGATION +write application&vast amount&依赖 +Hadoop framework&framework&GENERALIZATION +we&programming model&依赖 +we&computer cluster&依赖 +we&large dataset&依赖 +application&datum&依赖 +enormous volume&datum&AGGREGATION +It&enormous volume&实现 +It&datum&实现 +We&former task&依赖 +We&former task&依赖 +we&chunk&依赖 +input dataset&dataset&GENERALIZATION +we&map job&依赖 +we&input dataset&依赖 +map job&job&GENERALIZATION +Map task&chunk&依赖 +Map task&task&GENERALIZATION +Map task¶llell&依赖 +we&reduce task&依赖 +map&reduce task&依赖 +we&input&依赖 +we&output&依赖 +reducers process&intermediate datum&依赖 +reducers process&map&依赖 +final output&framework&AGGREGATION +reducers process&map&依赖 +reducers process&intermediate datum&依赖 +reducers process&intermediate datum&依赖 
+reducers process&map&依赖 +smaller tuple&task&依赖 +reducers process&map&依赖 +reducers process&intermediate datum&依赖 +MapReduce framework&framework&GENERALIZATION +MapReduce framework&task&依赖 +MapReduce framework&scheduling and monitoring&依赖 +scheduling and monitoring&task&AGGREGATION +failed task&framework&依赖 +framework&distributed processing&依赖 +framework&programmer&依赖 +framework&little expertise&依赖 +MapReduce&overview&依赖 +MapReduce&MapReduce Architecture and MapReduce ’s phase&依赖 +overview&MapReduce Architecture and MapReduce ’s phase&AGGREGATION +MapReduce&overview&依赖 +MapReduce&MapReduce Architecture and MapReduce ’s phase&依赖 +diagram&MapReduce architecture&依赖 +MapReduce architecture&various component&依赖 +brief description&understanding&依赖 +brief description&component&AGGREGATION +brief description&works&依赖 +our&understanding& +piece&actual work&AGGREGATION +MapReduce job&job&GENERALIZATION +MapReduce job&many small task&依赖 +task tracker&tracker&GENERALIZATION +tracker&scheduling job&依赖 +tracker&role&依赖 +status&task&AGGREGATION +tracker&task&依赖 +job tracker&tracker&GENERALIZATION +result&mapping and reduce&AGGREGATION +a program or application programming&MapReduce&依赖 +a program or application programming&job&依赖 +MapReduce&job&依赖 +MapReduce&many client&依赖 +division&main job&AGGREGATION +client&job&依赖 +client&job&依赖 +client&MapReduce Master&依赖 +client&MapReduce Master&依赖 +master&job&依赖 +master&equal sub-part&依赖 +job-part&two main task&依赖 +job-part&MapReduce&依赖 +requirement&organization or company&AGGREGATION +developer&logic&依赖 +reducer&output&依赖 +reducer&final output&依赖 +MapReduce program&program&GENERALIZATION +diagram&simplified flow diagram&依赖 +diagram&MapReduce program&实现 +trackers&work&依赖 +job&two key component&依赖 +job&map task&依赖 +map task&role&依赖 +map task&job-part&依赖 +map task&task&GENERALIZATION +map task&splitting job&依赖 +reduce task&role&依赖 +reduce task&shuffling&依赖 +job tracker&act&依赖 +job tracker&master&依赖 +job tracker&master&依赖 +job tracker&act&依赖 +It&job&依赖 +job tracker schedule job&job tracker schedule job&依赖 +It&job&依赖 +task tracker&map task&依赖 +Task tracker&job tracker&依赖 +Task tracker&assigned job&依赖 +Task tracker&status&依赖 +status&assigned job&AGGREGATION +diagram&work&依赖 +MapReduce program&three main phase&依赖 +phase&MapReduce&AGGREGATION +combiner phase&phase&GENERALIZATION +first phase&program&AGGREGATION +Mapping Phase This&program&依赖 +splitting step&step&GENERALIZATION +dataset&equal unit&依赖 +dataset&splitting step&依赖 +splitting step&input split&依赖 +Hadoop&RecordReader&依赖 +splitting step&TextInputFormat&依赖 +key-value pair&mapping step&依赖 +key-value pair&input&依赖 +mapper&that&依赖 +mapping step&logic&依赖 +mapping step&step&GENERALIZATION +output&same form and ( key-value pair&AGGREGATION +mapper&key-value pair&依赖 +mapper&step&依赖 +second phase&completion&依赖 +second phase&Mapping phase&依赖 +completion&Mapping phase&AGGREGATION +Mapping phase&phase&GENERALIZATION +second phase&place&依赖 +It&two main step ###&依赖 +It&two main step&依赖 +shuffling phase&duplicate value&依赖 +shuffling phase&removal&依赖 +removal&duplicate value&AGGREGATION +grouping&value&AGGREGATION +output&phase&AGGREGATION +output&Reducer phase&依赖 +output&reducer phase&依赖 +output&shuffling phase&AGGREGATION +output&input&依赖 +shuffling phase&phase&GENERALIZATION +Reducer phase&phase&GENERALIZATION +reducer&input&依赖 +summary&entire dataset&AGGREGATION +output&hdf&依赖 +diagram&example&依赖 +example&MapReduce&AGGREGATION +diagram&three main phase&依赖 +Example&MapReduce&AGGREGATION +duplicate output&phase&依赖 +duplicate 
output&single output&依赖 +combiner phase&Shuffling phase&依赖 +Shuffling phase&phase&GENERALIZATION +performance&Jobs&AGGREGATION +four phase&MapReduce&AGGREGATION +benefit&Hadoop MapReduce Speed&AGGREGATION +MapReduce&huge unstructured datum&依赖 +MapReduce&short time&依赖 +MapReduce framework&failure&依赖 +scale-out feature&process or store datum&依赖 +Hadoop&scale-out feature&依赖 +scale-out feature&cost-effective manner&依赖 +scale-out feature&user&依赖 +MapReduce&user&依赖 +MapReduce&application&依赖 +replica&network&依赖 +replica&various node&依赖 +replica&datum&AGGREGATION +event&failure&AGGREGATION +copy&datum&AGGREGATION +multiple job-part&same dataset&AGGREGATION +multiple job-part&MapReduce&依赖 +multiple job-part¶llel manner&依赖 +practical application&MapReduce program&AGGREGATION +hadoop mapreduce&practical application&依赖 +hadoop mapreduce&MapReduce program&依赖 +application&hadoop mapreduce&AGGREGATION +E-commerce E-commerce company&MapReduce&依赖 +E-commerce E-commerce company&MapReduce&依赖 +Social network&certain information&依赖 +Social network&Facebook , Twitter , and LinkedIn&依赖 +Social network&Facebook , Twitter , and LinkedIn&依赖 +Social network&certain information&依赖 +Social network&social media platform&依赖 +Social network&social media platform&依赖 +who&status&依赖 +It&important information&依赖 +your&status& +who&profile&依赖 +It&status&依赖 +your&profile& +Entertainment Netflix&MapReduce&依赖 +clicks and log&online customer&AGGREGATION +customer&interests and behavior&依赖 +information&movie&依赖 +crucial processing component&Hadoop framework&AGGREGATION +Conclusion MapReduce&MapReduce&GENERALIZATION +Conclusion MapReduce&Hadoop framework&依赖 +quick , scalable , and cost-effective program&huge datum&依赖 +It&quick , scalable , and cost-effective program&依赖 +company&framework&依赖 +company&framework&依赖 +their&strategies& diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..b770a957054fafed7d850f2371eade4b589219f1 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021-relation.txt @@ -0,0 +1,123 @@ +enormous measure&datum&AGGREGATION +Reduce task&datum&依赖 +mapping and splitting&datum&AGGREGATION +Our&Programs& +you&privacy policy&依赖 +our&policy& +whatsapp & other means&communication&AGGREGATION +project&MapReduce&AGGREGATION +enormous scope&data examination&AGGREGATION +MapReduce Architecture Components&MapReduce Architecture 1&AGGREGATION +two significant part&Hadoop&AGGREGATION +MAPREDUCE ARCHITECTURE HDFS and MapReduce architecture&Hadoop&依赖 +MapReduce task&2 phase&依赖 +MapReduce&many programming language&依赖 +MapReduce&different diverse various improvement&依赖 +Map Phase Reduce&library&依赖 +map every one&position&AGGREGATION +motivation&position&依赖 +motivation&map every one&依赖 +motivation&map every one&依赖 +motivation&position&依赖 +it&comparable undertaking&依赖 +it&it&依赖 +motivation&map every one&依赖 +motivation&position&依赖 +who&Job&依赖 +who&MapReduce&依赖 +MAPREDUCE ARCHITECTURE Components&MapReduce Architecture&AGGREGATION +numerous client&work&依赖 +Hadoop MapReduce Master&position part&依赖 +Hadoop MapReduce Master&specific occupation&依赖 +aftereffect&last yield&依赖 +aftereffect&work part&AGGREGATION +end-product&preparation&依赖 +we&MapReduce Architecture&依赖 +we&client&依赖 
+client&specific size&依赖 +client&Hadoop MapReduce Master&依赖 +job&specific size&AGGREGATION +client&job&依赖 +MapReduce expert&additional identical job part&依赖 +MapReduce expert&job&依赖 +Map&programming&依赖 +Map&necessity&依赖 +necessity&utilization case&AGGREGATION +Map&tackling&依赖 +engineer&rationale&依赖 +business&that&依赖 +their&rationale& +we&which&依赖 +Map&moderate key-esteem pair&依赖 +Map&yield&依赖 +its&yield& +output&Map&AGGREGATION +last yield&Hadoop Distributed File System&依赖 +last yield&hdf&依赖 +n number&MapReduce assignment&AGGREGATION +calculation&upgraded way&依赖 +calculation&extent&依赖 +calculation&least&依赖 +we&MapReduce phase&依赖 +its&architecture& +comprehension&architecture&AGGREGATION +MapReduce Architecture&example , Map phase and Reduce phase&依赖 +MapReduce Architecture&two phase&依赖 +its&use& +principle use&key-esteem set&依赖 +principle use&input datum&依赖 +sort&address&AGGREGATION +id&sort&AGGREGATION +key&address&依赖 +key&sort&依赖 +Map ( ) capacity&memory vault&依赖 +its&vault& +Map ( ) capacity&input key-esteem pair&依赖 +one&input key-esteem pair&AGGREGATION +Map ( ) capacity&one&依赖 +middle&key-esteem&AGGREGATION +reducer total&data-dependent&依赖 +reducer total&key-esteem pair&依赖 +its&pair& +reducer total&key-esteem pair&依赖 +reducer total&key-esteem pair&依赖 +reducer total&key-esteem pair&依赖 +reducer total&data-dependent&依赖 +reducer total&data-dependent&依赖 +reducer total&key-esteem pair&依赖 +reducer total&data-dependent&依赖 +reducer total&key-esteem pair&依赖 +reducer total&data-dependent&依赖 +reducer total&key-esteem pair&依赖 +reducer total&data-dependent&依赖 +reducer total&data-dependent&依赖 +reducer total&key-esteem pair&依赖 +reducer total&key-esteem pair&依赖 +reducer total&data-dependent&依赖 +reducer total&data-dependent&依赖 +Task Tracker&MapReduce Architecture&依赖 +Task Tracker&Task Tracker&依赖 +It&real slave&依赖 +one&MapReduce task&依赖 +Task Tracker&one&依赖 +Task Tracker&node&依赖 +one&node&AGGREGATION +additionally one significant segment&MapReduce Architecture&AGGREGATION +plan&Hadoop&AGGREGATION +plan&different objective&依赖 +different objective&it&依赖 +different objective&you&依赖 +hadoop mapreduce architecture diagram&hadoop mapreduce architecture diagram&依赖 +Hadoop MapReduce framework architecture&three significant layer&依赖 +MapReduce Architecture system&monstrous information&依赖 +MapReduce Architecture system&mind-boggling interaction&依赖 +other supporting square&Hadoop&AGGREGATION +you&information examiner&依赖 +you&well-known programming language&依赖 +you&profession&依赖 +you&data science field&依赖 +you&point&依赖 +their&jobs& +Academy&Program& +’s postgraduate certificate program&Cloud Computing&依赖 +’s postgraduate certificate program&Cloud aspirant&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/What are the components of MapReduce-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What are the components of MapReduce-relation.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffe9e69078f112a2ae5aac6652660a4fe3ac62dd --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What are the components of MapReduce-relation.txt @@ -0,0 +1,7 @@ +component&MapReduce JobTracker&AGGREGATION +master&job&依赖 +main component&mapreduce job tracker job tracker&AGGREGATION +JobTracker&TaskTrackers&依赖 +JobTracker&job&依赖 +status&task&AGGREGATION +TaskTracker&task&依赖 diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2-relation.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2-relation.txt new file mode 100644 index 
0000000000000000000000000000000000000000..93e85145312b8c8a0fea6b3702d314e2ffac723b --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2-relation.txt @@ -0,0 +1,520 @@ +each a set&key/value pair&AGGREGATION +mapreduce & hadoop ii mingshen sun chinese university&hong kong mssun@cse.cuhk.edu.hk mingshen sun ( cuhk ) mapreduce & hadoop outline • mapreduce recap • design pattern&AGGREGATION +set&value&AGGREGATION +set&intermediate key/value pairs • Reduce&AGGREGATION +number&partition&AGGREGATION +value&reducer 3 mingshen sun ( cuhk ) mapreduce & hadoop mapreduce recap • optional&依赖 +set&output value • MapReduce framework guarantee&AGGREGATION +v2 ) •&intermediate key/value pairs • Reduce&依赖 +v2 ) •&set&依赖 +simple hash&key and e.g.&AGGREGATION +v3 ) •&set&依赖 +v3 ) •&output value • MapReduce framework guarantee&依赖 +v2 ’ ) • mini-reducer&later 4 mingshen sun ( cuhk ) mapreduce & hadoop mapreduce recap 5 30 chapter 2&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&MapReduce&AGGREGATION +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 1 b 2 c 9&依赖 +we&multitude&依赖 +a 1 5 b 2 7 c 2 9 8 p p p p reducer reducer reducer x&7 z 9 figure 2.4&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&2 b&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 5 c&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&2 b&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&2 b&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 5 c&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 5 c&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 1 b 2 c 9&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&依赖 +multitude&algorithm&AGGREGATION +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&2 b&依赖 +we&multitude&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 1 b 2 c 9&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 5 c&依赖 +we&algorithm&依赖 +we&algorithm&依赖 +7 c 8 partitioner partitioner partitioner partitioner shuffle and sort&a 1 b 2 c 9&依赖 +all term t 2 doc d&4&依赖 +[ c1&] ) 3&依赖 +[ c1&] ) 3&依赖 +[ c1&] ) 3&依赖 +[ c1&sum 0 4&依赖 +[ c1&sum 0 4&依赖 +[ c1&sum 0 4&依赖 +[ c1&] ) 3&依赖 +[ c1&sum 0 4&依赖 +] do 5&] do 5&依赖 +sum sum + c 6&basic word count algorithm&依赖 +sum sum + c 6&basic word count algorithm&依赖 +sum sum + c 6&basic word count algorithm&依赖 +sum sum + c 6&basic word count algorithm&依赖 +amount&intermediate datum&AGGREGATION +Section 2.4&output&依赖 +output&mapper&AGGREGATION +Section 2.4&mapper&依赖 +1 • in-mapper combine •&unique term&依赖 +1 • in-mapper combine •&key-value pair&依赖 +1 • in-mapper combine •&document 17 3.1&依赖 +term t 2 doc d&5&依赖 +Emit&entire document 6&依赖 +Emit&entire document 6&依赖 +Emit&entire document 6&依赖 +Emit&all term t&依赖 +Emit&all term t&依赖 +Emit&entire 
document 6&依赖 +2 h&7&依赖 +Emit&all term t&依赖 +Emit&all term t&依赖 +workings&detail&依赖 +workings&detail&依赖 +workings&algorithm critically&AGGREGATION +block&input count entire document mingshen sun ( cuhk ) mapreduce & hadoop word count&AGGREGATION +term t 2 doc d&6&依赖 +2 h&{ t } ) figure 3.3&依赖 +improved MapReduce word count algorithm&“ in-mapper combine ” design pattern&依赖 +Reducer&Figure 3.1&依赖 +Hadoop&example&依赖 +it&all&依赖 +Hadoop&guarantee&依赖 +execution framework&it&依赖 +execution framework&option&依赖 +In-Mapper Combiners • advantage&in-mapper combiner&AGGREGATION +local aggregation&place&依赖 +semantics&MapReduce&依赖 +semantics&contrast&依赖 +semantics&default combiner&AGGREGATION +Default combiner&map output&依赖 +• state&> potentially large memory overhead&依赖 +• state&mapper&依赖 +• algorithmic behavior&> potential order-dependent bug&依赖 +input keyvalue pair&> potential order-dependent bug&依赖 +input keyvalue pair&order&依赖 +• algorithmic behavior&order&依赖 +• with/without combiner&algorithm correctness •&依赖 +you&Java&依赖 +algorithm correctness •&0 , 1&依赖 +you&combiner class&依赖 +version 1 •&drawback&依赖 +•&reducer&依赖 +• i.e.&set combiner class&依赖 +• i.e.&reducer class 21 3.1&依赖 +r1 , r2&cnt 0 5&依赖 +r1 , r2&sum 0 4&依赖 +r1 , r2&] ) 3&依赖 +r1 , r2&cnt 0 5&依赖 +r1 , r2&] ) 3&依赖 +r1 , r2&sum 0 4&依赖 +mean&value&AGGREGATION +basic MapReduce algorithm&mean&依赖 +]&6&依赖 +MapReduce algorithm&algorithm&GENERALIZATION +]&basic MapReduce algorithm&依赖 +basic MapReduce algorithm&value&依赖 +value&mean&依赖 +value&mean&依赖 +value&value&依赖 +value&value&依赖 +mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 +mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 +mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 +we&concrete example&依赖 +mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 +mean ( 1 , 2 , 3 , 4 , 5 ) 6 = mean ( mean ( 1 , 2 )&basic MapReduce algorithm&依赖 +Version 1 • Mean&means&AGGREGATION +it&problem&依赖 +It&a problem&依赖 +It&Word Count problem&依赖 +optimization&algorithm 23 48 chapter 3&依赖 +correctness&algorithm 23 48 chapter 3&AGGREGATION +r 7&Emit&依赖 +r 7&1 8&依赖 +r 7&1 8&依赖 +r 7&( string t&依赖 +r 7&Emit&依赖 +r 7&pair ( sum&依赖 +r 7&pair ( sum&依赖 +r 7&( string t&依赖 +sum sum +&incorrect first attempt&依赖 +sum sum +&combiner&依赖 +mismatch&MapReduce programming model&依赖 +MapReduce programming model&programming model&GENERALIZATION +mismatch&MapReduce programming model&依赖 +sum sum +&value&依赖 +sum sum +&mean&依赖 +mismatch&MapReduce programming model&依赖 +We&complex key and value&依赖 +optimization&correctness&依赖 +correctness&algorithm&AGGREGATION +restriction&programming model&依赖 +optimization&algorithm&依赖 +combiner&value&依赖 +combiner&integer&依赖 +combiner&list&依赖 +output value type&reducer&AGGREGATION +list&integer&AGGREGATION +list&pair&AGGREGATION +it&mingshen sun ( cuhk ) mapreduce & hadoop computing&依赖 +it&Version 3 •&依赖 +cnt cnt + c 8&Mean&依赖 +cnt cnt + c 8&Mean&依赖 +cnt cnt + c 8&Mean&依赖 +cnt cnt + c 8&Mean&依赖 +integer ravg ) figure 3.6&value&依赖 +cnt cnt + c 8&Mean&依赖 +integer ravg ) figure 3.6&mean&依赖 +integer ravg ) figure 3.6&mingshen sun ( cuhk ) mapreduce & hadoop computing&依赖 +•&combiner&依赖 +25 50 chapter 3&25 50 chapter 3&依赖 +MapReduce algorithm&value&依赖 +MapReduce algorithm&mean&依赖 +2&10&依赖 +reducer&Figure 3.6 and one&依赖 +reducer&correct sum and count&依赖 +combiner&aggregate partial sum&依赖 +they&many time&依赖 +reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer&依赖 +reducer mingshen sun 
( cuhk ) mapreduce & hadoop pairs&input key-value type&依赖 +reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer&依赖 +input key-value type&reducer&AGGREGATION +reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&input key-value type&依赖 +reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer&依赖 +performance&computation&AGGREGATION +reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&依赖 +output key-value type&combiner&AGGREGATION +reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&依赖 +reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&依赖 +reducer mingshen sun ( cuhk ) mapreduce & hadoop pairs&input key-value type&依赖 +• m = n * n matrix ( n = number&unique word&AGGREGATION +26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 +26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 +26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 +26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 +[ j ] = number&time word w&AGGREGATION +26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 +26 mingshen sun ( cuhk ) mapreduce & hadoop a new run example • problem&time word w&依赖 +distributional profile&word&AGGREGATION +we&whole matrix&实现 +billion&word&AGGREGATION +single machine&MapReduce&依赖 +single machine&whole matrix •&依赖 +each co-occur word pair and integer 1 •&use&依赖 +•&aggregate partial count&依赖 +default combiner&combiner&GENERALIZATION +each co-occur word pair and integer 1 •&reducer&依赖 +• mapper&partial count&依赖 +28 mingshen sun ( cuhk ) mapreduce & hadoop pairs&approach&依赖 +28 mingshen sun ( cuhk ) mapreduce & hadoop pairs&approach&依赖 +we&default combiner&依赖 +Our&approach& +term w 2 doc d&4&依赖 +2 neighbor and ( w&5&依赖 +[ c1&s 0 4&依赖 +[ c1&s 0 4&依赖 +[ c1&s 0 4&依赖 +[ c1&s 0 4&依赖 +Sum co-occurrence&figure 3.8 ( pair p&依赖 +Sum co-occurrence&count s&依赖 +Sum co-occurrence&( pair p&依赖 +h1 , h2 , h3&Hf new AssociativeArray 4&依赖 +h1 , h2 , h3&] ) 3&依赖 +h1 , h2 , h3&] ) 3&依赖 +h1 , h2 , h3&Hf new AssociativeArray 4&依赖 +] do 5&sum ( hf&依赖 +huge • both pair and stripe&in-mapper combine 34 mingshen sun ( cuhk ) mapreduce & hadoop pairs v.s.&依赖 +better use&combiners • Con&AGGREGATION +Memory size&associative array&AGGREGATION +) percentage ( second&approach figure 3.10&依赖 +) percentage ( second&stripe&依赖 +) percentage ( second&stripe&依赖 +time&” algorithm&AGGREGATION +) percentage ( second&" approach " pair&依赖 +) percentage ( second&" approach " pair&依赖 +) percentage ( second&apw corpus r2 = 0.992 r2 = 0.999&依赖 +) percentage ( second&stripe&依赖 +) percentage ( second&stripe&依赖 +) percentage ( second&approach figure 3.10&依赖 +) percentage ( second&apw corpus r2 = 0.992 r2 = 0.999&依赖 +approach figure 3.10&apw corpus r2 = 0.992 r2 = 0.999&AGGREGATION +) percentage ( second&approach figure 3.10&依赖 +) percentage ( second&" approach " pair&依赖 +word cooccurrence matrix&APW corpus&AGGREGATION +di ↵ erent fraction&APW corpus&AGGREGATION +) percentage ( second&apw corpus r2 = 0.992 r2 = 0.999&依赖 +) percentage ( second&apw corpus r2 = 0.992 r2 = 0.999&依赖 +) percentage ( second&approach figure 3.10&依赖 +) percentage ( second&" approach " pair&依赖 +experiment&19 slave and ###&依赖 +experiment&Hadoop cluster&依赖 +experiment&19 slave and&依赖 +“&” • estimate relative frequency&依赖 
+“&counts •&依赖 +we&MapReduce&依赖 +we&problem&依赖 +drawback&co-occurrence count&AGGREGATION +counts •&•&GENERALIZATION +other&itself&依赖 +36 Relative Frequencies Drawback&co-occurrence count&AGGREGATION +good ”&“ hello world ” estimate relative frequency&依赖 +good ”&MapReduce&依赖 +we&problem&依赖 +31&count& +use&partitioner&GENERALIZATION +sure same word&same reducer ( use and null&依赖 +sure same word&partitioner&依赖 +) • reducer&state&依赖 +order&key&AGGREGATION +• MapReduce&key&依赖 +• MapReduce&MapReduce&GENERALIZATION +• MapReduce&you&依赖 +• MapReduce&sort 39 mingshen sun ( cuhk ) mapreduce & hadoop order inversion&依赖 +• MapReduce&order&依赖 +• MapReduce&Idea •&依赖 +design pattern&order inversion&AGGREGATION +Idea •&•&GENERALIZATION +what&value&依赖 +individual count&same reducer • Preserve state&依赖 +sort order&intermediate key&AGGREGATION +computation&marginal • Control&AGGREGATION +individual count&reducer&依赖 +reading&reducer • buffer value&依赖 +• naive solution •&v ) •&依赖 +• naive solution •&> ( t&依赖 +reading&memory&依赖 +reading&id&依赖 +• naive solution •&• id&依赖 +• sensors record temperature&temperature v ) 41 mingshen sun ( cuhk ) mapreduce & hadoop secondary&依赖 +• sensors record temperature&temperature v ) 41 mingshen sun ( cuhk ) mapreduce & hadoop secondary&依赖 +reading&sensor id&AGGREGATION +•&processing • anything&依赖 +•&multiple key-value pair&依赖 +•&state&依赖 +• value-to-key conversion •&• ( id&依赖 +sorting&43 mingshen sun ( cuhk ) mapreduce & hadoop tools&依赖 +in-mapper combine • ram vs. disk&in-mapper combine • ram vs. disk&依赖 +in-mapper combine • ram vs. disk&• main idea&依赖 +in-mapper combine • ram vs. disk&• main idea&依赖 +reducers process key&• Control order&依赖 +key&local aggregation&依赖 +reducer process&which&依赖 +sorting&Synchronization • Cleverly-constructed data structure&依赖 +44 mingshen sun ( cuhk ) mapreduce & hadoop issues and tradeoffs • number&key-value pair&AGGREGATION +reducers process key&Scale • Works&依赖 +sorting&reducer ( 43 mingshen sun ( cuhk ) mapreduce & hadoop tools&依赖 +in-mapper combine • ram vs. disk&• main idea&依赖 +reducers process key&small dataset&依赖 +in-mapper combine • ram vs. disk&in-mapper combine • ram vs. disk&依赖 +reducers process key&Partitioner • Control&依赖 +in-mapper combine • ram vs. disk&in-mapper combine • ram vs. disk&依赖 +reducer process&a big difference • combiner&依赖 +reducers process key&Partitioner • Control&依赖 +in-mapper combine • ram vs. disk&• main idea&依赖 +key&local aggregation&依赖 +data together • Sort order&intermediate key&AGGREGATION +in-mapper combine • ram vs. disk&in-mapper combine • ram vs. 
disk&依赖
+network • Size&each key-value pair • de/serialization overhead&AGGREGATION
+• Memory management issue&mangled input record&依赖
+list&posting&AGGREGATION
+• Each term&posting&依赖
+• Each term&list&依赖
+document id&id&GENERALIZATION
+• Each post&document id&依赖
+INVERTED INDEXING&1 4 11 19 figure 4.1&依赖
+Simple illustration&inverted index&AGGREGATION
+term&list posting&依赖
+postings list&list&GENERALIZATION
+front&postings list&AGGREGATION
+auxiliary data structure&integer document id&依赖
+auxiliary data structure&mapping&依赖
+retrieval&postings list&依赖
+large •&MapReduce&依赖
+“&” )&AGGREGATION
+54 mingshen sun ( cuhk ) mapreduce & hadoop baseline implementation •&goal&依赖
+Our&goal&
+54 mingshen sun ( cuhk ) mapreduce & hadoop baseline implementation •&construct&依赖
+• actual document content ( value&What&依赖
+• Analyze&each document and extract useful term&依赖
+reducer&What&依赖
+reducer&term&依赖
+reducer&• Aggregates all observed posting&依赖
+2 H&posting p ) mingshen sun ( cuhk ) mapreduce & hadoop baseline implementation 57 4.4&依赖
+implementation 77 one fish&2 red d2 1 bird d3 1 one d3 1 red d3 1 reducer shuffle and sort&依赖
+implementation 77 one fish&2 one red bird doc 3 mapper mapper mapper fish&依赖
+implementation 77 one fish&inverted indexing&依赖
+implementation 77 one fish&aggregate value&依赖
+input&view •&依赖
+view •&•&GENERALIZATION
+input&’s point&依赖
+posting&term •&AGGREGATION
+• Reducer&list (&依赖
+• Reducer&Reducer&GENERALIZATION
+reducer&point&
+list&term&AGGREGATION
+input&term&依赖
+• Reducer&need )&依赖
+1 red d2 1 d3 1 d3 1 simple illustration&baseline inverted indexing algorithm&AGGREGATION
+task&reducer&AGGREGATION
+1 red d2 1 d3 1 d3 1 simple illustration&large , distributed group&依赖
+’s point&view •&AGGREGATION
+MapReduce framework&most heavy lifting&依赖
+two fish doc&aggregate value&实现
+Doc&hat&
+two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&实现
+two fish doc&3 fish 1 2 one 1 two 1 red 2 cat 3 blue 2 hat 3 shuffle and sort&依赖
+two fish doc&aggregate value&依赖
+Positional&fish&
+5&2 H&依赖
+hn1 , f1us , hn2 and f2us&P new List 4&依赖
+hn1 , f1us , hn2 and f2us&] ) 3&依赖
+] do 5&] do 5&依赖
+] do 5&sort ( p ) 7&依赖
+] do 5&baseline inverted indexing algorithm&AGGREGATION
+• Reducer&sufficient memory&依赖
+mingshen sun ( cuhk ) mapreduce & hadoop scalability issue • assumption&baseline implementation&AGGREGATION
+• Reducer&posting&依赖
+reducer first buffer&posting line 5 )&依赖
+reducer first buffer&line 5 )&依赖
+Key idea&MapReduce framework&依赖
+tuple&same reducer&依赖
+63 mingshen sun ( cuhk ) mapreduce & hadoop revise implementation 64 4.5&62 mingshen sun ( cuhk ) mapreduce & hadoop revise implementation •&依赖
+63 mingshen sun ( cuhk ) mapreduce & hadoop revise implementation 64 4.5&disk directly • caution&依赖
+you&customized partitioner&依赖
+2 h&scalable inverted indexing algorithm&依赖
+7&scalable inverted indexing algorithm&AGGREGATION
+2 h&MapReduce&依赖
+2 h&7&依赖
+you&MapReduce&依赖
+you&graph&依赖
+70 mingshen sun ( cuhk ) mapreduce & hadoop graph representations • two common representation&71 5.1&依赖
+linear algebra • easy algorithmic implementation • large memory space and esp&linear algebra • easy algorithmic implementation • large memory space and esp&依赖
+graph • shortest mean smallest hop count&minimum hop&依赖
+mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&shortest path&依赖
+mingshen sun ( cuhk ) mapreduce & hadoop single-source shortest path&source node&依赖
+graph • shortest mean smallest hop count&graph • shortest mean smallest hop count&依赖
+72 mingshen sun ( cuhk ) mapreduce & hadoop dijkstra&’s algorithm&依赖
+source node&node&GENERALIZATION
+Dijkstra&algorithm&
+Figure 5.3&’s algorithm&AGGREGATION
+GRAPH algorithm&) 8 13 10 1 n2 n4 8 9 10 1 n2 n4 8 9 10 1 n2 n4 0 5 7 5 2 3 9 7 4 6 n1 0 5 7 5 2 3 9 7 4 6 n1 0 5 7 5 2 3 9 7 4 6 n1 2 n3 n5 2 n3 n5 2 n3 n5&依赖
+GRAPH algorithm&indicate&依赖
+GRAPH algorithm&5 ∞ 2 7 1 n3 n5 5 7 2 7 1 n3 n5 (&依赖
+a ) – (&running&依赖
+a ) – (&algorithm&依赖
+running&algorithm&AGGREGATION
+78 mingshen sun ( cuhk ) mapreduce & hadoop bfs pseudo-code 79 mingshen sun ( cuhk ) mapreduce & hadoop stopping criterion •&adjacency list )&依赖
+many iteration&parallel bfs ( equal edge weight case )&依赖
+78 mingshen sun ( cuhk ) mapreduce & hadoop bfs pseudo-code 79 mingshen sun ( cuhk ) mapreduce & hadoop stopping criterion •&( n&依赖
+78 mingshen sun ( cuhk ) mapreduce & hadoop bfs pseudo-code 79 mingshen sun ( cuhk ) mapreduce & hadoop stopping criterion •&emit ( n&依赖
+driver program check&counter value&依赖
+many iteration&parallel bf&依赖
+amount&time&AGGREGATION
+captures notion&page importance&AGGREGATION
+random jump • n&graph 84 pr&依赖
+• One&thousand&AGGREGATION
+total number&node&AGGREGATION
+probability&random jump • n&AGGREGATION
+thousand&feature&AGGREGATION
+out-degree&t •&AGGREGATION
+• c ( t )&t •&依赖
+random jump • n&node&依赖
+t •&•&GENERALIZATION
diff --git a/src/main/resources/cdtocode/zbackup-Apache OODT File Manager.xls b/src/main/resources/cdtocode/zbackup-Apache OODT File Manager.xls
deleted file mode 100644
index 6a6f6160f576ae388ad7590c4c4f77b9dcfac6e7..0000000000000000000000000000000000000000
Binary files a/src/main/resources/cdtocode/zbackup-Apache OODT File Manager.xls and /dev/null differ
diff --git a/src/main/resources/cdtocode/zbackup-Hadoop HDFS.xls b/src/main/resources/cdtocode/zbackup-Hadoop HDFS.xls
deleted file mode 100644
index a1a7baa99be8dfdea1f8c4fac937be4edf4d5c64..0000000000000000000000000000000000000000
Binary files a/src/main/resources/cdtocode/zbackup-Hadoop HDFS.xls and /dev/null differ
diff --git a/src/main/resources/cdtocode/zbackup-Hadoop MapReduce.xls b/src/main/resources/cdtocode/zbackup-Hadoop MapReduce.xls
deleted file mode 100644
index a1a7baa99be8dfdea1f8c4fac937be4edf4d5c64..0000000000000000000000000000000000000000
Binary files a/src/main/resources/cdtocode/zbackup-Hadoop MapReduce.xls and /dev/null differ
diff --git a/src/test/java/com/hy/java/uct/cdtocode/CodeRelationMapperTest.java b/src/test/java/com/hy/java/uct/cdtocode/CodeRelationMapperTest.java
index bd5ad1f71249da2b95ea31fdce0a66c7051b52ae..e6786eb6fea7514a3410374fc395fc3baf4f281a 100644
--- a/src/test/java/com/hy/java/uct/cdtocode/CodeRelationMapperTest.java
+++ b/src/test/java/com/hy/java/uct/cdtocode/CodeRelationMapperTest.java
@@ -61,6 +61,14 @@ public class CodeRelationMapperTest {
 	 */
 	private static final String res_dir = System.getProperty("user.dir") + "\\src\\test\\resources\\cdtocode\\";
 
+	@Test
+	public void getName() {
+		String dir = "D:\\eclipse-committers\\Apache OODT File Manager\\src\\main\\java\\org\\apache\\oodt\\cas\\filemgr\\datatransfer\\TransferStatusTracker.java";
+		System.out.println(dir.lastIndexOf("\\"));
+		System.out.println(dir.lastIndexOf("."));
+		System.out.println(dir.substring(dir.lastIndexOf("\\") + 1, dir.lastIndexOf(".")));
+	}
+
 	@Test
 	public void write() {
 		try {
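
Review note on the getName() hunk above: the three println calls show how a simple class name is carved out of a Java file path with lastIndexOf/substring. Below is a minimal, self-contained Java sketch of that same string logic; the class and method names (PathNameSketch, getSimpleClassName) are illustrative only and do not exist in this repository, and the sketch assumes Windows-style backslash separators, matching the test's hard-coded path.

public class PathNameSketch {
	// Take the text between the last path separator ('\') and the last '.',
	// e.g. "...\\TransferStatusTracker.java" -> "TransferStatusTracker".
	// Assumes the path uses backslashes and ends in a file extension.
	static String getSimpleClassName(String javaFilePath) {
		int lastSlash = javaFilePath.lastIndexOf('\\');
		int lastDot = javaFilePath.lastIndexOf('.');
		return javaFilePath.substring(lastSlash + 1, lastDot);
	}

	public static void main(String[] args) {
		String dir = "D:\\eclipse-committers\\Apache OODT File Manager\\src\\main\\java\\org\\apache\\oodt\\cas\\filemgr\\datatransfer\\TransferStatusTracker.java";
		// Prints "TransferStatusTracker"
		System.out.println(getSimpleClassName(dir));
	}
}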