diff --git a/pom.xml b/pom.xml index 10d9fd7fff8d6c024c85c033cce25ebec6299e02..9498ceef3af46c447e812060b3c08cdb685ce849 100644 --- a/pom.xml +++ b/pom.xml @@ -81,5 +81,11 @@ 4.2.2 models-chinese + + + net.sourceforge.jexcelapi + jxl + 2.6.12 + \ No newline at end of file diff --git a/src/main/java/com/hy/java/uct/cdtocode/CDToCodeTracer.java b/src/main/java/com/hy/java/uct/cdtocode/CDToCodeTracer.java index a2f336ba1c3092a89bfbad1c17aaf0906a3eb019..d6ae3f901451c0a906cf1dd8a022458c83fd46cd 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/CDToCodeTracer.java +++ b/src/main/java/com/hy/java/uct/cdtocode/CDToCodeTracer.java @@ -27,6 +27,11 @@ public class CDToCodeTracer { */ private static final String code_dir = System.getProperty("user.dir") + "\\src\\main\\resources\\cdtocode\\code\\"; + /** + * 将追踪结果放在res_dir目录下 + */ + private static final String res_dir = System.getProperty("user.dir") + "\\src\\main\\resources\\cdtocode\\"; + /** * 针对类图及其相关文档,追踪到代码中的类 */ @@ -35,16 +40,44 @@ public class CDToCodeTracer { * 1、读取模型信息 */ // 读取完UML图识别结果后,将实体信息保存在classes_in_CD里。形式为 - Map classes_in_CD = CDReader.read(cd_dir + "cd-eclipse_jetty.txt"); + /* + * Apache OODT File Manager + */ + Map classes_in_CD = CDReader.read(cd_dir + "cd-Apache OODT File Manager.txt"); + /* + * Hadoop HDFS + */ + /* + * Hadoop MapReduce + */ // 检查结果,可注释掉 // CDReader.check(classes_in_CD); /* * 2、读取文档信息 + * + * 做“自己方法内有无文档的对比”的实验时,在不导入任何文档即可 */ List doc_dir_ls = new ArrayList<>(); // 在这儿添加多个文件 - doc_dir_ls.add(doc_dir + "jetty\\basic-architecture.adoc"); - doc_dir_ls.add(doc_dir + "jetty\\Jetty10 Operations Guide _ The Eclipse Foundation.txt"); + /* + * Apache OODT File Manager + */ + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt"); + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions.txt"); + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\cas-filemgr – CAS File Manager Developer Guide.txt"); + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Catalog and Archive File Management Component.txt"); + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\File Manager Scale Out Planning - OODT - Apache Software Foundation.txt"); + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Interface Ingester.txt"); + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Mahasen Distributed Storage Resource Broker.txt"); + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\OODT Filemgr User Guide.txt"); + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\Package org.apache.oodt.cas.filemgr.cli.action.txt"); + doc_dir_ls.add(doc_dir + "Apache OODT File Manager\\React file manager.txt"); + /* + * Hadoop HDFS + */ + /* + * Hadoop MapReduce + */ // 实际使用的Map,保存每份文档地址及其内容 Map> dir_sentences_map = DocReader.readDocs(doc_dir_ls); /* @@ -52,6 +85,7 @@ public class CDToCodeTracer { * * <类全称(包+类名), java_file_path> */ + // 记得改这里面写的路径 Map classFullName_javaFileDir_map = CodeReader.read(code_dir + "code path"); // 检查结果,可注释掉 // CodeReader.check(classFullName_javaFileDir_map); @@ -75,7 +109,19 @@ public class CDToCodeTracer { * 基于启发式模糊匹配的UML类图与代码追踪方法:首先针对类图中的类和代码中的类,基于类的名称进行字符串完全匹配,从而建立确定的初始追踪;基于同义词、 词缀词典等语料库,定义基于类名称匹配和关联关系的启发式匹配规则,研究基于模糊匹配技术的追踪关系建立方法,基于初始追踪和启发式规则, 对类名不一致的模型和代码元素进行启发式追踪,扩展初始追踪关系。 */ Map mapped_classes = CodeRelationMapper.map(classShortName_classObj_mappedByDoc, classFullName_javaFileDir_map); 
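/*
 * Illustrative sketch only (not the project's actual implementation): the two-stage matching
 * strategy described in the comment above, i.e. exact matching on normalized class names to
 * establish the initial traces, followed by a fuzzy comparison for names that do not match
 * exactly. The character-bigram Jaccard measure below is an assumption for illustration; the
 * 0.5 threshold mirrors the one applied by sim_EntDocName_ClsCodeName in CodeRelationMapper.
 */
// static boolean sketch_namesMatch(String modelClassName, String codeClassShortName) {
//     String a = modelClassName.toLowerCase().replaceAll("[^a-z0-9]", "");
//     String b = codeClassShortName.toLowerCase().replaceAll("[^a-z0-9]", "");
//     if (a.equals(b)) {
//         return true; // exact match on normalized names: certain initial trace
//     }
//     java.util.Set<String> ga = sketch_bigrams(a);
//     java.util.Set<String> gb = sketch_bigrams(b);
//     java.util.Set<String> inter = new java.util.HashSet<>(ga);
//     inter.retainAll(gb);
//     java.util.Set<String> union = new java.util.HashSet<>(ga);
//     union.addAll(gb);
//     return !union.isEmpty() && (double) inter.size() / union.size() > 0.5;
// }
//
// static java.util.Set<String> sketch_bigrams(String s) {
//     java.util.Set<String> res = new java.util.HashSet<>();
//     for (int i = 0; i + 1 < s.length(); i++) {
//         res.add(s.substring(i, i + 2));
//     }
//     return res;
// }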
+ /* + * Apache OODT File Manager + */ + CodeRelationMapper.save(mapped_classes, res_dir + "Apache OODT File Manager.xls"); + /* + * Hadoop HDFS + */ + // CodeRelationMapper.save(mapped_classes, res_dir + "Hadoop HDFS.xls"); + /* + * Hadoop MapReduce + */ + // CodeRelationMapper.save(mapped_classes, res_dir + "Hadoop MapReduce.xls"); // 检查结果,可注释掉 - CodeRelationMapper.check(mapped_classes); + // CodeRelationMapper.check(res_dir + "Apache OODT File Manager.xls"); } } diff --git a/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIDataGenerator.java b/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIDataGenerator.java new file mode 100644 index 0000000000000000000000000000000000000000..d35e4514e49f52499d743ceabca69c7625f2ef08 --- /dev/null +++ b/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIDataGenerator.java @@ -0,0 +1,9 @@ +package com.hy.java.uct.cdtocode; + +public class VSMAndLSIDataGenerator { + + public static void main(String[] args) { + // TODO Auto-generated method stub + + } +} diff --git a/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIParser.java b/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIParser.java new file mode 100644 index 0000000000000000000000000000000000000000..0242f12dc821b109a411f93ffdf13cf1ab8f6d03 --- /dev/null +++ b/src/main/java/com/hy/java/uct/cdtocode/VSMAndLSIParser.java @@ -0,0 +1,12 @@ +package com.hy.java.uct.cdtocode; + +/** + * 解析VSM和LSI结果文件(csv文件) + */ +public class VSMAndLSIParser { + + public static void main(String[] args) { + // TODO Auto-generated method stub + + } +} diff --git a/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java b/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java index 18f5857b70c10c4d0674b770db266837e68b17b6..58514846ce2edd48b1769bbf4e1ef59771aef6db 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java +++ b/src/main/java/com/hy/java/uct/cdtocode/mapper/CodeRelationMapper.java @@ -2,8 +2,11 @@ package com.hy.java.uct.cdtocode.mapper; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; @@ -27,18 +30,29 @@ import com.hy.java.uct.util.EntityRelation; import com.hy.java.uct.util.UMLClass; import com.hy.java.utility.common.Pair; +import jxl.Sheet; +import jxl.Workbook; +import jxl.read.biff.BiffException; +import jxl.write.Label; +import jxl.write.WritableSheet; +import jxl.write.WritableWorkbook; +import jxl.write.WriteException; +import jxl.write.biff.RowsExceededException; + public class CodeRelationMapper { /** * 对扩展后的模型与代码进行匹配 */ public static Map map(Map classShortName_classObj_mappedByDoc, Map classFullName_javaFileDir_map) { + System.out.println("开始对扩展后的模型与代码进行匹配"); Set ClsImg_shortName_set = classShortName_classObj_mappedByDoc.keySet(); /* * 针对每个包含文档信息的实体,将其追踪到代码中的类 * * 模糊匹配:根据Ent_doc的类名、属性、方法,匹配代码中可能追踪到的类Cls_code,并计算追踪概率Sim(Ent_doc,Cls_code) */ + System.out.println("开始基于模糊匹配,将Cls_img追踪到代码"); for (String ClsImg_shortName : ClsImg_shortName_set) { UMLClass UML_class = classShortName_classObj_mappedByDoc.get(ClsImg_shortName); // 对于每个UML_class,将其追踪到的每个Ent_doc,都追踪到代码(即Cls_code) @@ -51,8 +65,11 @@ public class CodeRelationMapper { mapEntToCode(related_Ent, classFullName_javaFileDir_map); } } + System.out.println("对" + UML_class.getTitle() + "的模糊匹配完成。匹配完所有类后,还需基于关系推理取舍追踪"); } + System.out.println("完成基于模糊匹配,将Cls_img追踪到代码"); // 
检查Sim(Ent_doc,Cls_code),可注释掉 + // 做“处理完文档后,对有无关系推理做对比”的实验时,可用check_SimEC()方法的结果作为“无推理”的结果 // check_SimEC(classShortName_classObj_mappedByDoc); /* * 基于关系推理的取舍 @@ -65,7 +82,9 @@ public class CodeRelationMapper { * * 进行完这步后,对Cls_img的每个Ent_doc而言,会将其追踪到数个truly_mapped_file,每个truly_mapped_file均对应数个PE_code(Ent_doc,Cls_code,R) */ + System.out.println("开始基于关系推理,对模糊匹配的追踪结果进行取舍"); detectUMLCLsRelations(classShortName_classObj_mappedByDoc, classFullName_javaFileDir_map); + System.out.println("完成基于关系推理,对模糊匹配的追踪结果进行取舍"); // 检查PE_code(Ent_doc,Cls_code,R),可注释掉 // check_PEcodeR(classShortName_classObj_mappedByDoc); /* @@ -78,6 +97,7 @@ public class CodeRelationMapper { getInitTraceResults(classShortName_classObj_mappedByDoc); // 此时UML_class.duplicated_mapped_javaFile_ls可能包含重复结果,所以还需过滤一遍 filterTraceResults(classShortName_classObj_mappedByDoc); + System.out.println("完成对扩展后的模型与代码进行匹配"); return classShortName_classObj_mappedByDoc; } @@ -133,7 +153,7 @@ public class CodeRelationMapper { String name1_lower = name1.toLowerCase(); String name2_lower = name2.toLowerCase(); // 判断处理后的两个String是否相似 - if (name2_lower.contains(name1_lower)) { + if (sim_EntDocName_ClsCodeName(name1_lower, name2_lower) > 0.5) { return true; } } @@ -316,9 +336,13 @@ public class CodeRelationMapper { try { CompilationUnit possibleMapped_ClsCode = StaticJavaParser.parse(new File(classFullName_javaFileDir_map.get(possibleMapped_javaFile_pair.getLeft()))); String cls_shortName = getClsShortNameFromFullName(possibleMapped_javaFile_pair.getLeft()); - // 如果一个Cls_code与某个由related_Ent追踪到的Cls_code'有同类关系,则该Cls_code就是“根据关系推理追踪到的代码文件”。 - // 注意: 如果Ent_code没有related_Ent,则直接认为Ent_code能纯粹根据文本相似度追踪到Cls_code + /* + * 如果一个Cls_code与某个由related_Ent追踪到的Cls_code'有同类关系,则该Cls_code就是“根据关系推理追踪到的代码文件”。 + * + * 注意: 如果Ent_code没有related_Ent,则直接认为Ent_code能纯粹根据文本相似度追踪到Cls_code + */ Pair> ifTrulyMapped_possibleRs = check_if_trulyMapped(possibleMapped_ClsCode, cls_shortName, possibleMapped_javaFile_pair.getLeft(), Ent_doc.relations_between_Ent); + // 如果确实真的可以这么追踪,则记录这个文件,并计算追踪概率 if (ifTrulyMapped_possibleRs.getLeft() == true) { // 记录这个“根据关系推理追踪到的代码文件”。 MappedFile truly_mapped_file_forEnt = new MappedFile(); @@ -376,8 +400,18 @@ public class CodeRelationMapper { truly_mapped = checkImpl(possibleMapped_ClsCode, related_Ent.possibleMapped_javaFiles, cls_shortName); break; } + // 由于一般情况下,图中只有依赖这种关系,所以此时不检测关系类型是否相同 case "依赖": { truly_mapped = checkDepend(possibleMapped_ClsCode, related_Ent.possibleMapped_javaFiles); + if (truly_mapped == false) { + truly_mapped = checkExtend(possibleMapped_ClsCode, related_Ent.possibleMapped_javaFiles, cls_shortName); + } + if (truly_mapped == false) { + truly_mapped = checkImpl(possibleMapped_ClsCode, related_Ent.possibleMapped_javaFiles, cls_shortName); + } + if (truly_mapped == false) { + truly_mapped = checkAggr(related_Ent.possibleMapped_javaFiles, cls_fullName); + } break; } case "聚合": { @@ -402,7 +436,7 @@ public class CodeRelationMapper { } /** - * 如果是继承关系,则判断继承目标是否能与某个related_Ent.mappedFile对上 + * 如果是继承关系,则判断其继承目标是否能与某个related_Ent.mappedFile对上 */ private static boolean checkExtend(CompilationUnit possibleMapped_ClsCode, List> relatedEnt_possibleMapped_javaFiles, String cls_shortName) { boolean res = false; @@ -589,6 +623,9 @@ public class CodeRelationMapper { UMLClass UML_class = classShortName_classObj_mappedByDoc.get(ClsImg_shortName); Map temp = new HashMap<>(); for (MappedFile mapped_javaFile_inCls : UML_class.duplicated_mapped_javaFile_ls) { + // 首先,先对mapped_javaFile_inCls自己追踪到的javaFile去重 + 
filterMapped_javaFile_inCls(mapped_javaFile_inCls); + // 其次,再进行两两比对 if (!temp.containsKey(mapped_javaFile_inCls.java_file_dir)) { temp.put(mapped_javaFile_inCls.java_file_dir, mapped_javaFile_inCls); } else { @@ -605,36 +642,81 @@ public class CodeRelationMapper { } } + /** + * 对mapped_javaFile_inCls自己追踪到的javaFile去重 + */ + private static void filterMapped_javaFile_inCls(MappedFile mapped_javaFile_inCls) { + Map>> cluster_map = new HashMap<>(); + Set> Ps = new HashSet<>(); + // 先分堆 + for (Pair P : mapped_javaFile_inCls.P_ls) { + if (P.getLeft() == null) { + if (cluster_map.containsKey("这是null")) { + cluster_map.get("这是null").add(P); + } else { + List> list = new ArrayList<>(); + list.add(P); + cluster_map.put("这是null", list); + } + } else { + if (cluster_map.containsKey(P.getLeft().related_ent.name)) { + cluster_map.get(P.getLeft().related_ent.name).add(P); + } else { + List> list = new ArrayList<>(); + list.add(P); + cluster_map.put(P.getLeft().related_ent.name, list); + } + } + } + // 然后同堆内取概率最大的 + Collection>> clusters = cluster_map.values(); + for (List> cluster : clusters) { + Pair P = Pair.createPair(cluster.get(0).getLeft(), cluster.get(0).getRight()); + for (Pair temp_P : cluster) { + if (P.getRight() < temp_P.getRight()) { + P.setRight(temp_P.getRight()); + } + } + Ps.add(P); + } + // 然后将res_map转存到mapped_javaFile_inCls.P_ls中 + mapped_javaFile_inCls.P_ls.clear(); + for (Pair P : Ps) { + mapped_javaFile_inCls.P_ls.add(P); + } + } + /** * 对比mapped_javaFile_inTemp和mapped_javaFile_inCls,合并两者的“基于关系的追踪” */ private static MappedFile mergeTwoMappedFiles(MappedFile mapped_javaFile_inTemp, MappedFile mapped_javaFile_inCls) { MappedFile res = new MappedFile(); res.java_file_dir = mapped_javaFile_inTemp.java_file_dir; + // 开始合并两个MappedFile的关系 for (Pair P_inTemp : mapped_javaFile_inTemp.P_ls) { - Pair P = P_inTemp; + Pair P_inRes = Pair.createPair(P_inTemp.getLeft(), P_inTemp.getRight()); // 接下来的比较类似排序:比较P与mapped_javaFile_inCls的P_inCls,如果两者具有相同的关系,则取概率大的 for (Pair P_inCls : mapped_javaFile_inCls.P_ls) { - if (P.getLeft() != null && P_inCls.getLeft() != null) { - if (P.getLeft().related_ent.name.equals(P_inCls.getLeft().related_ent.name) && P.getLeft().relation_type.equals(P_inCls.getLeft().relation_type)) { + if (P_inRes.getLeft() != null && P_inCls.getLeft() != null) { + if (P_inRes.getLeft().related_ent.name.equals(P_inCls.getLeft().related_ent.name) && P_inRes.getLeft().relation_type.equals(P_inCls.getLeft().relation_type)) { P_inCls.getLeft().should_be_del = true; // 取概率大的 - if (P.getRight() < P_inCls.getRight()) { - P = P_inCls; + if (P_inRes.getRight() < P_inCls.getRight()) { + P_inRes.setRight(P_inCls.getRight()); } } - } else if (P.getLeft() == null && P_inCls.getLeft() == null) { + } else if (P_inRes.getLeft() == null && P_inCls.getLeft() == null) { // 如果P_inCls没有EntityRelation,则做一个假的EntityRelation用于标记该P_inCls在后面需被忽略掉 EntityRelation null_er = new EntityRelation(); null_er.should_be_del = true; P_inCls.setLeft(null_er); // 取概率大的 - if (P.getRight() < P_inCls.getRight()) { - P = P_inCls; + if (P_inRes.getRight() < P_inCls.getRight()) { + P_inRes.setRight(P_inCls.getRight()); } } } - res.P_ls.add(P); + res.P_ls.add(P_inRes); } // 然后再把mapped_javaFile_inCls里没检测到的P_inCls都添加上 for (Pair P_inCls : mapped_javaFile_inCls.P_ls) { @@ -657,26 +739,145 @@ public class CodeRelationMapper { } /** - * 检查追踪结果 + * 保存追踪结果 + * + * 类,追踪到的代码,追踪概率,参考关系,参考关系的目标 */ - public static void check(Map mapped_classes) { - Set ClsImg_shortName_set = mapped_classes.keySet(); - for (String ClsImg_shortName : 
ClsImg_shortName_set) { - System.out.println("======================================================"); - UMLClass UML_class = mapped_classes.get(ClsImg_shortName); - if (UML_class.mapped_javaFile_ls.size() > 0) { - for (MappedFile mapped_javaFile : UML_class.mapped_javaFile_ls) { - for (Pair P : mapped_javaFile.P_ls) { - if (P.getLeft() != null) { - System.out.println(UML_class.getTitle() + "有" + P.getRight() + "的概率追踪到代码中的" + mapped_javaFile.java_file_dir + ",参考其与文档实体" + P.getLeft().related_ent.name + "的" + P.getLeft().relation_type + "关系。"); - } else { - System.out.println(UML_class.getTitle() + "有" + P.getRight() + "的概率追踪到代码中的" + mapped_javaFile.java_file_dir + ",这条追踪是没有相关关系的。"); + public static void save(Map mapped_classes, String res_dir) { + try { + // 工作簿 + WritableWorkbook workbook = Workbook.createWorkbook(new File(res_dir)); + if (workbook != null) { + // 新建第一个工作表 + WritableSheet sheets = workbook.createSheet("Sheet1", 0); + // 构建工作表的表头 + Label label1 = new Label(0, 0, "类"); + sheets.addCell(label1); + Label label2 = new Label(1, 0, "追踪到的代码"); + sheets.addCell(label2); + Label label3 = new Label(2, 0, "追踪概率"); + sheets.addCell(label3); + Label label4 = new Label(3, 0, "参考关系类型"); + sheets.addCell(label4); + Label label5 = new Label(4, 0, "参考关系的目标"); + sheets.addCell(label5); + // 从第二行开始,保存每个类的追踪结果 + Set ClsImg_shortName_set = mapped_classes.keySet(); + int row = 1; + for (String ClsImg_shortName : ClsImg_shortName_set) { + UMLClass UML_class = mapped_classes.get(ClsImg_shortName); + // 保存追踪结果 + if (UML_class.mapped_javaFile_ls.size() > 0) { + for (MappedFile mapped_javaFile : UML_class.mapped_javaFile_ls) { + for (Pair P : mapped_javaFile.P_ls) { + if (P.getLeft() != null) { + // 类 + Label _class = new Label(0, row, UML_class.getTitle()); + sheets.addCell(_class); + // 追踪到的代码 + Label code = new Label(1, row, mapped_javaFile.java_file_dir); + sheets.addCell(code); + // 追踪概率 + Label ratio = new Label(2, row, P.getRight().toString()); + sheets.addCell(ratio); + // 参考关系类型 + Label ref_relation = new Label(3, row, P.getLeft().relation_type); + sheets.addCell(ref_relation); + // 参考关系的目标 + Label ref_relation_target = new Label(4, row, P.getLeft().related_ent.name); + sheets.addCell(ref_relation_target); + row++; + } else { + // 类 + Label _class = new Label(0, row, UML_class.getTitle()); + sheets.addCell(_class); + // 追踪到的代码 + Label code = new Label(1, row, mapped_javaFile.java_file_dir); + sheets.addCell(code); + // 追踪概率 + Label ratio = new Label(2, row, P.getRight().toString()); + sheets.addCell(ratio); + // 参考关系类型 + Label ref_relation = new Label(3, row, null); + sheets.addCell(ref_relation); + // 参考关系的目标 + Label ref_relation_target = new Label(4, row, null); + sheets.addCell(ref_relation_target); + row++; + } + } } } + // 当前类没有追踪到对应的代码实现 + else { + // 类 + Label _class = new Label(0, row, UML_class.getTitle()); + sheets.addCell(_class); + // 追踪到的代码 + Label code = new Label(1, row, null); + sheets.addCell(code); + // 追踪概率 + Label ratio = new Label(2, row, null); + sheets.addCell(ratio); + // 参考关系类型 + Label ref_relation = new Label(3, row, null); + sheets.addCell(ref_relation); + // 参考关系的目标 + Label ref_relation_target = new Label(4, row, null); + sheets.addCell(ref_relation_target); + row++; + } + } + // 写入文件 + workbook.write(); + workbook.close(); + } + } catch (RowsExceededException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (WriteException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO 
Auto-generated catch block + e.printStackTrace(); + } + } + + /** + * 检查追踪结果 + */ + public static void check(String res_dir) { + try { + // 工作簿 + Workbook book = Workbook.getWorkbook(new File(res_dir)); + // 获得第一个工作表对象 + Sheet sheet = book.getSheet("Sheet1"); + // Sheet sheet = book.getSheet(0); + int rows = sheet.getRows(); + for (int row = 1; row < rows; row++) { + String _class = sheet.getCell(0, row).getContents(); + String code = sheet.getCell(1, row).getContents(); + String ratio = sheet.getCell(2, row).getContents(); + String ref_relation = sheet.getCell(3, row).getContents(); + String ref_relation_target = sheet.getCell(4, row).getContents(); + if (code != null) { + if (ref_relation != null) { + System.out.println(_class + "有" + ratio + "的概率追踪到代码中的" + code + ",参考其与文档实体" + ref_relation_target + "的" + ref_relation + "关系。"); + } else { + System.out.println(_class + "有" + ratio + "的概率追踪到代码中的" + code + ",这条追踪是没有相关关系的。"); + } + } else { + System.out.println(_class + "没有追踪到对应的代码实现。"); } - } else { - System.out.println(UML_class.getTitle() + "没有追踪到对应的代码实现。"); } + book.close(); + } catch (BiffException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); } } } diff --git a/src/main/java/com/hy/java/uct/cdtocode/mapper/DocAnalyzer.java b/src/main/java/com/hy/java/uct/cdtocode/mapper/DocAnalyzer.java index f42113cf0caa4f6a9f323e1e33a6e26fde8b6fcc..c8e2b8b20c794b941f4780a1e6efcd1da3c9b924 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/mapper/DocAnalyzer.java +++ b/src/main/java/com/hy/java/uct/cdtocode/mapper/DocAnalyzer.java @@ -1,13 +1,13 @@ package com.hy.java.uct.cdtocode.mapper; +import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.commons.collections4.SetUtils; @@ -19,6 +19,10 @@ import com.hy.java.uct.util.UMLClass; import com.hy.java.utility.common.FileEditor; import com.hy.java.utility.common.Pair; +import jxl.Sheet; +import jxl.Workbook; +import jxl.read.biff.BiffException; + /** * 分析文档信息。实际相当于增加类图中的UMLclass、类本身的内容、类之间关系 * @@ -40,10 +44,12 @@ public class DocAnalyzer { * dir_sentences_map的形式:<文档地址,文档句子列表> */ public static Map analyze(Map classes_in_CD, Map> dir_sentences_map) { + System.out.println("开始分析文档信息"); // 基于启发式匹配规则(模糊匹配和词性分析)和概率的追踪 classInCD_to_entInDoc(classes_in_CD, dir_sentences_map); // 基于规则匹配和概率分析提取类之间关系 analyze_relationBetweenEnts(classes_in_CD, dir_sentences_map); + System.out.println("完成分析文档信息"); return classes_in_CD; } @@ -65,6 +71,8 @@ public class DocAnalyzer { * 1、预处理赵子岩的相似词结果 * * 将相似词结果整理为clsImg_EntDoc_justZhao,形式为>> + * + * 子岩把所有文件的结果都合并到一个文件里了。所以把子岩的结果重命名为第一个文件,其他文件置空即可。 */ Map>> clsImg_EntDoc_justZhao = new HashMap<>(); // 先将类图中的类名都填充进去 @@ -73,13 +81,18 @@ public class DocAnalyzer { } // 对每个相似词文件,遍历其中的<原词,相似词>对,以原词为key保存相似词。相似度为Jaccard相似度 for (String doc_dir : docDir_set) { + // 处理一下赵子岩的文件,格式化为"原词,相似词"对 + processSimDoc(doc_dir); FileEditor simEnts_doc = new FileEditor(doc_dir.substring(0, doc_dir.lastIndexOf(".")) + "-simEnts.txt"); List similar_names_raw = simEnts_doc.readLines(); - // 遍历相似词文件中的<原词,相似词>对 - for (String clsName_similarName : similar_names_raw) { - String[] pair = clsName_similarName.split(" "); - if (clsImg_EntDoc_justZhao.containsKey(pair[0])) { - 
clsImg_EntDoc_justZhao.get(pair[0]).add(Pair.createPair(pair[1], sim_ClsImg_EntDoc(pair[0], pair[1]))); + if (similar_names_raw != null) { + // 遍历相似词文件中的<原词,相似词>对 + for (String clsName_similarName : similar_names_raw) { + // source_target[0]是原词,source_target[1]是相似词 + String[] source_target = clsName_similarName.split(","); + if (clsImg_EntDoc_justZhao.containsKey(source_target[0])) { + clsImg_EntDoc_justZhao.get(source_target[0]).add(Pair.createPair(source_target[1], sim_ClsImg_EntDoc(source_target[0], source_target[1]))); + } } } } @@ -195,6 +208,57 @@ public class DocAnalyzer { // check_PEdoc(classes_in_CD); } + /** + * 格式化赵子岩的结果(-ziyan.txt),将其保存到-simEnts.txt中 + * + * 每一行都应该是:原词,相似词 + * + * 其中,原词和相似词都不能有空格(即应该是类名) + */ + private static void processSimDoc(String doc_dir) { + FileEditor res_fe = null; + // 找一下同目录下的赵子岩的结果,然后再格式化 + FileEditor ziyan_fe = new FileEditor(doc_dir.substring(0, doc_dir.lastIndexOf(".")) + "-ziyan.txt"); + if (ziyan_fe.readFileToString() != null) { + res_fe = new FileEditor(doc_dir.substring(0, doc_dir.lastIndexOf(".")) + "-simEnts.txt"); + res_fe.write("", false); + List ziyan_lines = ziyan_fe.readLines(); + for (String ziyan_line : ziyan_lines) { + String[] source_sym_pair = ziyan_line.split(","); + // source的词得去空格 + String source = source_sym_pair[0].replaceAll(" ", ""); + // sym的词得根据其大小写判断是去空格还是保留空格 + String sym = processSym(source_sym_pair[1]); + res_fe.write(source + "," + sym + "\n", true); + } + } + } + + /** + * sym的词得根据其大小写判断是去空格还是保留空格 + * + * @param string + * @return + */ + private static String processSym(String string) { + String res = string.trim(); + // 检查res到底是类名拆开了(temp的每个词都是大写开头)、还是本身就是一个词组(temp中存在小写开头的词) + String[] temp = res.split(" "); + boolean is_clsName = true; + // 看看temp中是否存在小写开头的词 + for (String word : temp) { + if (Character.isLowerCase(word.charAt(0))) { + is_clsName = false; + break; + } + } + // 如果res是类名,则需去空格。否则,直接返回原res即可 + if (is_clsName) { + res = res.replaceAll(" ", ""); + } + return res; + } + /** * 基于规则匹配和概率分析提取实体之间关系 * @@ -211,38 +275,43 @@ public class DocAnalyzer { Set docDir_set = dir_sentences_map.keySet(); /* * 1、杜佳诺的规则部分会直接给出概念间关系R,可信度为R(Ent_doc,Ent'_doc) + * */ - Map> EntDoc_OtherEntDoc_justDu = new HashMap<>(); - // 先将每个类追踪到的Ent_doc都填充进去 - for (String class_short_name : classShortName_set) { - UMLClass Cls_img = classes_in_CD.get(class_short_name); - for (Entity ent : Cls_img.mappedEnt_ls) { - EntDoc_OtherEntDoc_justDu.put(ent.name, new ArrayList<>()); - } - } // 对每个语义分析文件,遍历其中的<源,目标,关系类型>组,以源为key保存关系。追踪概率为R(Ent_doc,Ent'_doc),默认为1 for (String doc_dir : docDir_set) { + // 在源文档的同目录下,会有杜佳诺给出的语义分析结果(格式为:ID, source, target, relation content, relationship type),对其进行格式化处理 + // 其中,source和target包含空格;relationship type包括GENERALIZATION(继承),AGGREGATION(聚合),和ASSOCIATION + processRelationDoc(doc_dir, classes_in_CD); + // 使用格式化处理后的“-relation.txt”文件 FileEditor relation_doc = new FileEditor(doc_dir.substring(0, doc_dir.lastIndexOf(".")) + "-relation.txt"); - List relations_raw = relation_doc.readLines(); // 遍历语义分析文件中的<源,目标,关系类型>组 - for (String relation : relations_raw) { - String[] group = relation.split(" "); + List relations_raw = relation_doc.readLines(); + for (String relation_line : relations_raw) { + // source_target_type[0]是源,source_target_type[1]是目标,source_target_type[2]是关系类型 + String[] source_target_type = relation_line.split("&"); // 对每个组,看其“源”是否属于某个Cls_img追踪到的Ent_doc。如果有,则将该Ent_doc取出,为其添加一条EntityRelation,目标和关系类型分别在组的第二、三项 for (String class_short_name : classShortName_set) { - // 遍历每个Cls_img,看其追踪到的Ent_doc是否包含“源” + // 
遍历每个Cls_img,看其追踪到的Ent_doc是否是“源” UMLClass Cls_img = classes_in_CD.get(class_short_name); - if (Cls_img.mappedEnt_names.contains(group[0])) { + for (Entity mappedEnt : Cls_img.mappedEnt_ls) { // 如果找到了,则取出该Ent_doc - for (Entity mappedEnt : Cls_img.mappedEnt_ls) { + if (sim_ClsImg_EntDoc(mappedEnt.name, source_target_type[0]) > 0.5) { // 为该Ent_doc添加一条EntityRelation,目标和关系类型分别在组的第二、三项 EntityRelation entity_relation = new EntityRelation(); Entity related_entity = new Entity(); - related_entity.name = group[1]; + // related_Ent的name是很奇怪的长串 + related_entity.name = source_target_type[1]; entity_relation.related_ent = related_entity; - entity_relation.relation_type = getRelationType(group[2]); + // 有些情况下,杜佳诺的结果里不包含关系类型。此时默认为“依赖” + if (source_target_type.length < 3) { + entity_relation.relation_type = "依赖"; + } else { + entity_relation.relation_type = getRelationType(source_target_type[2]); + } // 追踪概率为R(Ent_doc,Ent'_doc),默认为1 entity_relation.R_Entdoc_OtherEntdoc = 1.0; mappedEnt.relations_between_Ent.add(entity_relation); + break; } } } @@ -260,13 +329,12 @@ public class DocAnalyzer { // 每条关系 for (EntityRelation entity_relationship : Ent_doc.relations_between_Ent) { Entity related_entity = entity_relationship.related_ent; - // 形成一个正则表达式,用于匹配文档的每句话、查找至少包含这两个词其一的句子 - String regex = "\\b(" + Ent_doc.name + "|" + related_entity.name + ")\\b"; + // 匹配文档的每句话、查找至少包含这两个词其一的句子 // 在所有设计文档中,根据名称查找相关语句,存到sentence_ls中 - List sentences_containing_at_least_one_ents = getSentencesMatchingRegex(dir_sentences_map, regex); + List> sentences_containing_at_least_one_ents = getSentencesWithEnts(dir_sentences_map, Ent_doc.name, related_entity.name); int and = 0; - for (String sentence : sentences_containing_at_least_one_ents) { - if (sentence.contains(Ent_doc.name) && sentence.contains(related_entity.name)) { + for (Pair sentence : sentences_containing_at_least_one_ents) { + if (sentence.getRight() == 2) { and++; } } @@ -278,7 +346,7 @@ public class DocAnalyzer { } } /* - * 3、针对从图中的类Cls_img追踪到的Ent_doc,在文档中提取Ent_doc与其他Ent'_doc之间的关系 + * 3、针对从图中的类Cls_img追踪到的Ent_doc,计算Ent_doc与其他Ent'_doc之间的关系概率 * * Ent_doc与Ent'_doc具有关系的概率为PR_doc(Ent_doc,Ent'_doc) */ @@ -306,20 +374,20 @@ public class DocAnalyzer { // 将此关系记录到两个类的所有Ent_doc中 for (Entity Ent_doc : Cls_img.mappedEnt_ls) { // 记录此关系:针对当前Ent_doc,检查related_Cls_img追踪的所有related_Ent是否在Ent_doc的关系中。如果有,则概率改为1;如果没有,则添加 - for (Entity related_Ent : related_Cls_img.mappedEnt_ls) { - boolean related = false; + for (Entity relatedCls_mappedEnt : related_Cls_img.mappedEnt_ls) { + boolean already_related = false; for (EntityRelation entity_relationship : Ent_doc.relations_between_Ent) { - if (entity_relationship.related_ent.name.equals(related_Ent.name) && entity_relationship.relation_type.equals(img_relation.type)) { + if (entity_relationship.related_ent.name.equals(relatedCls_mappedEnt.name) && entity_relationship.relation_type.equals(img_relation.type)) { // 如果有,则概率改为1 entity_relationship.PR_doc = 1.0; - related = true; + already_related = true; } } // 如果没有,则添加 - if (!related) { + if (!already_related) { // 为该Ent_doc添加数条EntityRelation,每条关系都指向related_Cls_img追踪到的Ent_doc EntityRelation entity_relation = new EntityRelation(); - entity_relation.related_ent = related_Ent; + entity_relation.related_ent = relatedCls_mappedEnt; entity_relation.relation_type = img_relation.type; entity_relation.PR_doc = 1.0; Ent_doc.relations_between_Ent.add(entity_relation); @@ -372,60 +440,121 @@ public class DocAnalyzer { } } + /** + * 在源文档的同目录下,会有杜佳诺给出的语义分析结果,对其进行格式化处理 + * + * 杜佳诺给出的格式为:ID, source, target, relation 
content, relationship type + * + * 其中,source和target包含空格;relationship type包括GENERALIZATION(继承),AGGREGATION(聚合),和ASSOCIATION + * + * 在ASSOCIATION里,如果relation content包含implement关键字,则视为实现;否则就是依赖 + * + * 将语义分析结果(excel文件)处理成“-relation.txt”文件,每一行的格式为:源 目标 关系类型 + * + * @param classes_in_CD + */ + private static void processRelationDoc(String doc_dir, Map classes_in_CD) { + String res_dir = doc_dir.substring(0, doc_dir.lastIndexOf(".")) + "-relation.txt"; + FileEditor fe = new FileEditor(res_dir); + fe.write("", false); + try { + // 工作簿 + Workbook book = Workbook.getWorkbook(new File(doc_dir + ".xml.xls")); + // 获得第一个工作表对象 + Sheet sheet = book.getSheet("Sheet0"); + // Sheet sheet = book.getSheet(0); + int rows = sheet.getRows(); + int cols = sheet.getColumns(); + // 对每行数据,去除source和target中的空格。解析关系类型 + for (int row = 1; row < rows; row++) { + String record = ""; + for (int column = 0; column < cols; column++) { + switch (column) { + // 去除source中的空格 + case 1: { + record = record + sheet.getCell(column, row).getContents().trim() + "&"; + break; + } + // 去除target中的空格 + case 2: { + record = record + sheet.getCell(column, row).getContents().trim() + "&"; + break; + } + // 解析关系类型。如果遇到ASSOCIATION,则根据relation content判断是否为实现关系 + case 4: { + String relation_type = sheet.getCell(column, row).getContents().trim().replaceAll(" ", ""); + if (relation_type.equals("ASSOCIATION")) { + String relation_content = sheet.getCell(column - 1, row).getContents().trim().replaceAll(" ", ""); + if (relation_content.contains("impl")) { + record = record + "实现"; + } else { + record = record + "依赖"; + } + } else if (!relation_type.equals("ATTRIBUTE")) { + record = record + relation_type; + } + break; + } + default: { + break; + } + } + } + fe.write(record + "\n", true); + } + book.close(); + } catch (BiffException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + /** * 根据杜佳诺给出的“关系类型+关系描述”,分析出真正的关系类型,包括“继承”、“实现”、“依赖”、“聚合”四种 */ private static String getRelationType(String raw) { String res = "依赖"; - // 该分隔符与杜佳诺商议 - String sep = ""; - String[] type_desc = raw.split(sep); - if (type_desc.length == 2) { - if (type_desc[1].contains("extend") || type_desc[1].contains("继承")) { - res = "继承"; - } - if (type_desc[1].contains("impl") || type_desc[1].contains("实现")) { - res = "实现"; - } - if (type_desc[1].contains("aggr") || type_desc[1].contains("聚合")) { - res = "聚合"; - } - } else { - if (type_desc[0].contains("extend") || type_desc[0].contains("继承")) { - res = "继承"; - } - if (type_desc[0].contains("impl") || type_desc[0].contains("实现")) { - res = "实现"; - } - if (type_desc[0].contains("aggr") || type_desc[0].contains("聚合")) { - res = "聚合"; - } + if (raw.contains("extend") || raw.contains("generaliz") || raw.contains("继承")) { + res = "继承"; + } + if (raw.contains("impl") || raw.contains("实现")) { + res = "实现"; + } + if (raw.contains("aggr") || raw.contains("聚合")) { + res = "聚合"; } return res; } /** * 在所有设计文档中,根据类名查找相关语句,存到sentence_ls中 + * + * name1是文档中出现的类名。name2是很奇怪的长串 + * + * @param name */ - private static List getSentencesMatchingRegex(Map> doc_map, String regex) { - List res = new ArrayList<>(); - Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE); - Matcher matcher = null; + private static List> getSentencesWithEnts(Map> doc_map, String name1, String name2) { + List> res = new ArrayList<>(); // 读doc、找UML_class、存到sentence_ls中 Collection> docs = doc_map.values(); for (List doc : docs) { for (String raw_sentence : doc) { if (!raw_sentence.isBlank()) { // 找UML_class - matcher = 
pattern.matcher(raw_sentence); - while (matcher.find()) { - if (!res.contains(raw_sentence)) { - // 保存包含某类名的句子 - res.add(raw_sentence); - // 可以看看这句话是什么。最后注释掉 - // System.out.println(raw_sentence); - } - break; + int count = 0; + if (raw_sentence.contains(name1) && raw_sentence.contains(name2)) { + count = 2; + } else if (raw_sentence.contains(name1) && !raw_sentence.contains(name2)) { + count = 1; + } else if (!raw_sentence.contains(name1) && raw_sentence.contains(name2)) { + count = 1; + } + if (count > 0) { + // 保存包含某类名的句子 + res.add(Pair.createPair(raw_sentence, count)); + // 可以看看这句话是什么。最后注释掉 + // System.out.println(raw_sentence); } } } diff --git a/src/main/java/com/hy/java/uct/cdtocode/util/Entity.java b/src/main/java/com/hy/java/uct/cdtocode/util/Entity.java index c18d0ab78166cb05751603c6de2ab11477f720c5..77f5338336fa6fed9bbf2356e165f97aafee6535 100644 --- a/src/main/java/com/hy/java/uct/cdtocode/util/Entity.java +++ b/src/main/java/com/hy/java/uct/cdtocode/util/Entity.java @@ -9,6 +9,9 @@ import com.hy.java.utility.common.Pair; public class Entity { public UMLClass parent_class; + /** + * 注意,如果是从赵子岩那儿追过来的Ent_doc,那么name就是文档中出现的类名。如果是related_Ent的name,那就是很奇怪的长串。 + */ public String name; /** * 基于启发式匹配规则(模糊匹配和词性分析)和概率的追踪 diff --git a/src/main/resources/cdtocode/Apache OODT File Manager.xls b/src/main/resources/cdtocode/Apache OODT File Manager.xls new file mode 100644 index 0000000000000000000000000000000000000000..be9436a6b680c54fd213be2eda53e909fa6e657f Binary files /dev/null and b/src/main/resources/cdtocode/Apache OODT File Manager.xls differ diff --git a/src/main/resources/cdtocode/Hadoop HDFS.xls b/src/main/resources/cdtocode/Hadoop HDFS.xls new file mode 100644 index 0000000000000000000000000000000000000000..a1a7baa99be8dfdea1f8c4fac937be4edf4d5c64 Binary files /dev/null and b/src/main/resources/cdtocode/Hadoop HDFS.xls differ diff --git a/src/main/resources/cdtocode/Hadoop MapReduce.xls b/src/main/resources/cdtocode/Hadoop MapReduce.xls new file mode 100644 index 0000000000000000000000000000000000000000..a1a7baa99be8dfdea1f8c4fac937be4edf4d5c64 Binary files /dev/null and b/src/main/resources/cdtocode/Hadoop MapReduce.xls differ diff --git a/src/main/resources/cdtocode/cd/cd-Apache OODT File Manager.txt b/src/main/resources/cdtocode/cd/cd-Apache OODT File Manager.txt index 2395f46f95c58a70be6e56d9b142acee12af382d..4115282bc811be239492f2cc896c529daab4ed73 100644 --- a/src/main/resources/cdtocode/cd/cd-Apache OODT File Manager.txt +++ b/src/main/resources/cdtocode/cd/cd-Apache OODT File Manager.txt @@ -1,60 +1,27 @@ -(611,446)AbstractHandler -@@@AbstractHandler -%AbstractLifeCycle -%继承¥AbstractHandler -%Handler -%继承¥@#(165,446)AbstractConnector -@@@AbstractConnector -%AbstractLifeCycle -%继承¥AbstractConnector -%Connector -%继承¥@#(366,303)AbstractLifeCycle -@@+doStart() -+doStop() -@AbstractLifeCycle -%LifeCycle -%继承¥@AbstractConnector -%AbstractLifeCycle -%继承¥AbstractHandler -%AbstractLifeCycle -%继承¥#(167,204)Connector -@+host: String -+port: int -@@Connector -%Buffers -%实现¥Connector -%LifeCycle -%实现¥@AbstractConnector -%Connector -%继承¥#(616,205)Handler -@@+handle(target,request,...) 
-@Handler -%LifeCycle -%实现¥@AbstractHandler -%Handler -%继承¥#(470,204)ThreadPool -@@+dispatch(Runnable) -@ThreadPool -%LifeCycle -%实现¥@#(110,19)Buffers -@@+getBuffer(size): Buffer -+returnBuffer(Buffer) -@Buffers -%Buffer -%依赖¥@Connector -%Buffers -%实现#(16,19)Buffer -@@@@Buffers -%Buffer -%依赖#(394,17)LifeCycle -@@+start() -+stop() -@@AbstractLifeCycle -%LifeCycle -%继承¥Connector -%LifeCycle -%实现¥ThreadPool -%LifeCycle -%实现¥Handler -%LifeCycle -%实现¥# \ No newline at end of file +(611,446)Product +@@@Product +%ProductType +%依赖¥Product +%Metadata +%依赖¥Product +%Reference +%依赖¥@#(165,446)Reference +@@@@Product +%Reference +%依赖¥#(366,303)Metadata +@@@@Product +%Metadata +%依赖¥#(167,204)ProductType +@@@ProductType +%Versioner +%依赖¥ProductType +%Element +%依赖¥@Product +%ProductType +%依赖¥#(616,205)Versioner +@@@@ProductType +%Versioner +%依赖¥#(470,204)Element +@@@@ProductType +%Element +%依赖¥# \ No newline at end of file diff --git a/src/main/resources/cdtocode/code/code path b/src/main/resources/cdtocode/code/code path index 39b9e7642207e8030f4c58804e8b12e7cecf1e07..31118504bd339b5a3d8c18b9c3fe0e2d76ad84e2 100644 --- a/src/main/resources/cdtocode/code/code path +++ b/src/main/resources/cdtocode/code/code path @@ -1 +1 @@ -D:\eclipse-committers\jetty.project-jetty-9.4.41.v20210516 \ No newline at end of file +D:\eclipse-committers\Apache OODT File Manager \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-relation.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-relation.txt index c92ae4773b74b21b9aa7b209b085a9f0f86cb88b..cac5cd6101b4bec2c7118f647b883acbc68e0b00 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-relation.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-relation.txt @@ -1,637 +1,742 @@ -A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE -STREAMS: THE V-FASTR EXPERIMENT AS A CASE STUDY -Andrew F. Hart, Luca Cinquini, Shakeh E. Khudikyan, David R. Thompson, -Chris A. Mattmann, Kiri Wagstaff, Joseph Lazio, and Dayton Jones -Jet Propulsion Laboratory, California Institute of Technology, Pasadena, CA 91109, USA; andrew.f.hart@jpl.nasa.gov -Received 2014 March 24; accepted 2014 August 10; published 2014 December 16 -ABSTRACT -“Fast radio transients” are defined here as bright millisecond pulses of radio-frequency energy. These shortduration -pulses can be produced by known objects such as pulsars or potentially by more exotic objects such as -evaporating black holes. The identification and verification of such an event would be of great scientific value. This -is one major goal of the Very Long Baseline Array (VLBA) Fast Transient Experiment (V-FASTR), a softwarebased -detection system installed at the VLBA. V-FASTR uses a “commensal” (piggy-back) approach, analyzing -all array data continually during routine VLBA observations and identifying candidate fast transient events. Raw -data can be stored from a buffer memory, which enables a comprehensive off-line analysis. 
This is invaluable for -validating the astrophysical origin of any detection. Candidates discovered by the automatic system must be -reviewed each day by analysts to identify any promising signals that warrant a more in-depth investigation. To -support the timely analysis of fast transient detection candidates by V-FASTR scientists, we have developed a -metadata-driven, collaborative candidate review framework. The framework consists of a software pipeline for -metadata processing composed of both open source software components and project-specific code written -expressly to extract and catalog metadata from the incoming V-FASTR data products, and a web-based data portal -that facilitates browsing and inspection of the available metadata for candidate events extracted from the VLBA -radio data. -Key words: catalogs – methods: data analysis – pulsars: general – radio continuum: general -1. INTRODUCTION -One of the current primary goals of radio astronomy is to -explore and understand the “dynamic radio sky” (Cordes -et al. 2004). In contrast to generating catalogs of known -sources, this scientific thrust focuses on transient events, or -transient signals generated by persistent yet time-varying -sources. We do not yet fully understand the scope and -distribution of different transient sources, which range from -the known (e.g., active galactic nuclei, brown dwarfs, flare -stars, X-ray binaries, supernovae, gamma-ray bursts) to the -probable (e.g., exoplanets), to the possible (e.g., ET -civilizations, annihilating black holes). As noted by Cordes -et al. (2004, p.14), “most exciting would be the discovery of -new classes of sources” (italics in original). Radio telescopes -continue to increase their data collecting abilities, observing -the sky with progressively finer time resolution. Of current -particular interest is the detection and characterization of -“fast radio transients,” which last for only small fractions of a -second. -The V-FASTR experiment (Wayth et al. 2011) is one of a -new breed of radio astronomy experiments specifically -targeting fast transient radio signals. The experiment is -conducted in a fully commensal (passive) fashion, searching -for signals in the data gathered during the regular processing -activities of its host instrument. Unlike more traditional, -single-telescope observations, however, the V-FASTR -experiment simultaneously utilizes anywhere between 2 and -10 telescopes of the National Radio Astronomy Observatory -ʼs (NRAO) Very Long Baseline Array (VLBA) (Romney -2010). The VLBA consists of 10 25 m telescopes that are -positioned geographically such that no 2 are within each -otherʼs local horizon, and the V-FASTR experiment -leverages this configuration to better discriminate between -instances of terrestrial Radio Frequency Interference (RFI) -and potentially genuine astronomical pulses (Thompson -et al. 2011). -The huge volumes of raw time-series voltage data generated -by the VLBA in the course of its operation make storing the -full record of an entire observing session infeasible at the -present time. As a consequence, considerable effort has been -devoted to developing and fine-tuning algorithms for the realtime -identification of potentially interesting signals in the noisy -and often incomplete data (Thompson et al. 2011; Wayth et al. -2012). 
All data selected by the real-time processing step is -subsequently inspected, on a daily basis, by members of the -geographically distributed V-FASTR science team and either -discarded as spurious or archived offline for full analysis at a -later date. -The V-FASTR experiment must therefore operate within -several important resource constraints: the inability to archive -the full observational record due to space constraints, and a -practical workload constraint upon the human analysts -reviewing candidate detections. To address the latter, we have -developed a metadata-driven, collaborative candidate review -framework for the V-FASTR experiment. The framework -comprises a set of software components dedicated to the -automatic capture and organization of metadata describing the -candidate events identified as interesting by the automated -algorithms, and an online environment for the collaborative -perusal and inspection of related imagery data by the V-FASTR -analysis team. -The rest of this paper describes the system as follows. In -Section 2 we describe our project in a more general context. -Section 3 presents the methodology and an architectural -description of the system. We follow with an evaluation of -The Astronomical Journal, 149:23 (7pp), 2015 January doi:10.1088/0004-6256/149/1/23 -© 2015. The American Astronomical Society. All rights reserved. -1 -our experience deploying the framework in Section 4, and -offer conclusions and future directions for the work in -Section 5. -2. BACKGROUND -To better understand the context of the system implementation -presented in Section 3, we first briefly introduce the VFASTR -experiment and describe the development of scientific -data systems at the NASA Jet Propulsion Laboratory (JPL). -We then describe the Object Oriented Data Technology -(OODT) project, an open source information integration -platform that plays a central role in our framework. Finally, -we briefly touch upon several related efforts at developing -online tools to collaboratively classify and validate scientific -observations. -2.1. V-FASTR: The VLBA Fast TRansients Experiment -V-FASTR (VLBA Fast TRansients) is a data analysis -system used by the VLBA to detect candidate fast transient -events. Principal investigators submit observing proposals to -the VLBA targeted at galaxies, supernovae, quasars, pulsars, -and more. V-FASTR analyzes all data collected by the VLBA -as part of routine processing and produces a nightly list of -candidates identified within the data processed that day. The -raw data for each candidate is temporarily saved in case it is -needed to interpret or follow up on a particularly promising or -unusual detection. However, the raw data consumes significant -disk space and therefore the candidate list must be reviewed on -a timely basis by experts. False positives can be deleted and -their disk space reclaimed, while truly interesting events can be -re-processed to enable the generation of a sky image to localize -the source of the signal. Software tools that streamline and -simplify this review process are therefore highly valued by -candidate reviewers and can have a positive impact on other -similar efforts throughout the world. -2.2. Data System Development at JPL -The Data Management Systems and Technologies group at -the JPL develops software ground data systems to support -NASA science missions. 
These pipelines are specifically -optimized to support the data-intensive and computationallyintensive -processing steps often needed to convert raw remotesensing -observations into higher level data products at scale so -that they can be interpreted by the scientists. The process -almost always involves developing a close collaboration with -project scientists to obtain an understanding of the processing -algorithms involved, a sense of the scale and throughput -requirements, and other operational constraints of the expected -production environment. -Over the years the group has developed a diverse portfolio of -data system experience across a broad spectrum of domains -including earth and climate science (Mattmann et al. 2009; -Hart et al. 2011; Tran et al. 2011), planetary science, -astrophysics, snow hydrology, radio astronomy, cancer -research (Crichton et al. 2001), and pediatric intensive care -(Crichton et al. 2011). -2.3. Open Source and OODT -One of the products of this long track record of experience in -the realm of scientific data processing systems is a suite of -software components known as OODT1 originally arose out of -a desire on the part of NASAʼs Office of Space Science to -improve the return on investment for individual mission data -systems by leveraging commonalities in their design to create a -reusable platform of configurable components, on top of which -mission-specific customizations could be made. OODT thus -represents both an architecture and a reference implementation. -Its components communicate with one another over standard, -open protocols such as XML-RPC2 and can be used either -individually, or coupled together to form more complex data -processing pipelines. -In 2009 OODT began the transition from a JPL-internal -development project to a free and open source software project -at the Apache Software Foundation (ASF).3 Graduating to a -top-level project in 2011, OODT has since undergone several -public releases at the ASF and is in use by a varied group of -scientific and commercial endeavors. As we will describe -further in Section 3, several OODT components form the core -platform of our candidate validation framework. The ready -availability of OODT components under a liberal license, -combined with their substantial pedigree was appealing to our -project both for time and budgetary considerations. -2.4. Related Work -In the following section we identify several ongoing efforts -that also utilize online tools to assist in the collaborative review -and classification of scientific observations. -2.4.1. Astropulse -Astropulse is part of a series of sky surveys for radio pulses -being conducted by the Search for Extraterrestrial Intelligence -(SETI) at the University of Berkeley (Siemion et al. 2010). -The Astropulse project conducts a survey of the sky from the -Arecibo Observatory in Puerto Rico, searching for short -(microsecond) broadband radio frequency pulses. While -Astropulseʼs use of Areciboʼs enormous single dish telescope -affords excellent sensitivity, V-FASTRʼs ability to perform -continent-scale baseline interferometery yields much greater -positional accuracy when attempting to localize the source of a -signal. 
-As a variant of the SETI@home project, Astropulse utilizes the -same distributed, collaborative volunteer computing infrastructure -accumulated over the years by that effort to perform a -number of computationally intense transformations and calculations -of the data in an attempt to better classify the origin of any -signals detected. The use of volunteer computing to perform units -of computational work is an appealing approach that obviates the -need to directly acquire sufficient hardware for the processing -demands. However, the fully automated nature of the approach is -not a natural fit for V-FASTRʼs manual review requirement. -2.4.2. Galaxy Zoo -GalaxyZoo4 is an Internet-based project that relies on the -help of volunteers to classify a very large database of galaxy -images recorded by either the Sloan Digital Sky Survey or the -Hubble telescope. Users are asked to classify galaxies based on -1 Apache OODT: http://oodt.apache.org/ -2 XML-RPC: http://xmlrpc.scripting.com/spec.html -3 http://apache.org/ -4 Galaxy Zoo: http://www.galaxyzoo.org/ -2 -The Astronomical Journal, 149:23 (7pp), 2015 January Hart et al. -shape, color and direction of rotation, and report on possible -unidentified features. The rationale behind human intervention -is that manual classification is more accurate and insightful -than any algorithm that can currently by undertaken by an -automatic program. To date, the project has met with success -that far exceeded expectations: more than 250,000 volunteers -have helped classify millions of images, resulting in the -confirmation of important scientific hypothesis, the formulation -of new ones, and the discovery of new interesting objects. -While Galaxy Zooʼs tactic of appealing to volunteers to -mitigate the challenge of image classification at scale is -attractive, the paradigm does not translate well to the V-FASTR -setting due to differences in the nature of the archives between -the two projects. Whereas Galaxy Zoo permits its volunteer -reviewers to leisurely peruse and mine a largely static image -archive, the rapidly growing data volumes associated with -ongoing V-FASTR observations dictate that reviews must be -regularly scheduled to keep the project within its resource -limits. -2.4.3. Foldit: The Protein Folding Game -Foldit (Cooper et al. 2010) is a collaborative online protein -folding game developed by the Center for Game Science -at the University of Washington, and it represents a -“crowd-sourced” attempt to solve the computationally challenging -task of predicting protein structure. Proteins, chains of -amino acids, play a key role in a wide range of human diseases, -but comparatively little is known about how they contort -themselves into the specific shapes that determine their -function. Because of the scale and complexity of the challenge, -the researchers behind Foldit have turned to the puzzle-solving -capabilities of human beings for assistance. After learning the -rules on simple challenges, players compete against one -another to design alternative protein structures, with the goal -of arriving at an arrangement that minimizes the total energy -needed to maintain the shape. -Foldit has created an environment in which the unknown and -diverse strategies of its human participants become a core -strength. Furthermore, by presenting the scientific activity as a -competitive game, the project, which currently boasts over -400,000 players, has shown that it is possible to recruit and -leverage human processing power at scale. 
This provides an -interesting model for other projects, including V-FASTR, -which at some point may rely upon a human element to -augment or improve automated processes. -3. IMPLEMENTATION -In this section we provide details on the implementation of -our metadata-driven framework for online review of V-FASTR -candidate detection events. We describe our methodology and -the considerations that informed our design, followed by a -presentation of the system architecture. -3.1. Development Methodology -Several factors influenced the development process and have -left their imprint on the final architecture. We feel that our -implementation is uniquely suited to the needs of the VFASTR -project precisely because these factors were identified -early on and were thus able to influence all aspects of the -design process. -3.1.1. Collaboration -As described in Section 2, our group has developed -substantial experience in the design and implementation of -data systems for a broad range of scientific domains. In each -case, a close working relationship with members of the project -science team was an essential ingredient to the success of the -project, and our experience developing an online candidate -review framework for V-FASTR was no different. As software -engineers familiar with the challenges inherent in scientific data -management, our intuitions about the technical challenges of -the system served us well in scoping out the project timeline. -However, it was our early and regular communication with -members of the V-FASTR science team that was critical to -obtaining the domain knowledge necessary to make accurate -assumptions, and in the early identification of issues. The -current system architecture, covering both the back and front -end elements, is a direct result of an ongoing feedback loop -between the science and software teams. -3.1.2. Constraints -As mentioned in Section 2, V-FASTR is a commensal -experiment that scans for fast transients in data that is already -being collected as part of the regular third-party use of the -VLBA instrument. As such, the experiment maintains a “guest” -status on the NRAO computing infrastructure. Consequently, -care must consistently be taken not to overtax NRAO system -resources, including disk storage, CPU time, and network -bandwidth. These physical constraints motivated many of the -architectural decisions described in the following sections. -Each V-FASTR data product may contain hundreds of files, -rooted at a top-level job directory, and includes two types of -products: filterbank data (up to ~100 GB per job) and -baseband voltage data (up to ~10 GB per job). The total data -storage capacity available to V-FASTR is just ~8 TB, enough -to contain ~800 jobs of ~10 GB each (on average). Because -products are produced at a average rate of ~10–20 per day (but -sometimes in the hundreds), the storage would be exhausted -within a few weeks unless products are periodically reviewed -by the science team analysts. During review, each candidate is -either flagged for higher-resolution processing (and saved) or -discarded as a false positive and the disk space reclaimed (see -Figure 1 for an overview of the average data volumes per job at -different processing stages). The desire to provide analysts with -a streamlined method for this review process is at the very core -of our design. 
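As a rough cross-check of the retention claim above, using only the figures quoted in this paragraph (the ~15 jobs per day midpoint is an assumption):

  ~8 TB total capacity / ~10 GB per job  ≈ 800 jobs of buffer space
  800 jobs / ~15 new jobs per day        ≈ 53 days

so without regular review and deletion of false positives the archive would indeed fill on a timescale of weeks.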
-Similarly, the network bandwidth constraints of the host led -us to a data transfer configuration that focused on metadata -rather than requiring the complete transfer of raw, unreviewed, -and possibly spurious detection data over the Internet. Instead, -metadata sufficient to describe the salient characteristics of a -candidate event to a trained analyst was transferred into our -candidate review framework. This careful selection process had -the beneficial side effect of greatly limiting the size of the -transferred products, allowing for a considerably longer -retention period on the ~10 TB archive hosted at JPL. -Finally, security constraints were also critically important to -the design, particularly because the system spans two separate -security domains: NRAO and JPL. To comply with the security -requirements of the host system, data transfer was configured -on the NRAO system to allow read-only operations and was -made accessible only to clients originating from the JPL -3 -The Astronomical Journal, 149:23 (7pp), 2015 January Hart et al. -domain. Furthermore, on the front-end, the functionality -exposed by the web portal component interacted only with -the local metadata archive, eliminating the possibility of -corruption or inappropriate access to the raw observational -data. -3.2. Architecture -As previously mentioned, the candidate review framework is -driven by metadata describing the candidate events to be -reviewed by V-FASTR analysts. To communicate this data -from the raw source repository at the NRAO to an analyst using -a browser anywhere in the world, we developed a software -framework consisting of two principal components: a metadata -pipeline that manages the capture, transfer, and storage of -metadata annotations, and a web portal which provides analysts -with a convenient, context-rich environment for efficiently -classifying candidate events. -3.2.1. Metadata Pipeline -On the JPL side, the V-FASTR data products are processed -through a metadata extraction and data archiving pipeline that -eventually leads to the event candidates being available for -inspection on the web portal. The pipeline is composed of three -major software components: rsync, the OODT CAS Crawler, -and the OODT File Manager, depicted in Figure 2. -rsync. Data products are automatically transferred from the -NRAO staging area to the JPL server using rsync. rsync is a -popular application and data transfer protocol that allows to -synchronize the content of a directory tree between two -servers with minimal human intervention. It was chosen -because of its simplicity, high performance, reliability, and -wide range of configuration options. Through rsync, files are -transferred in compressed format and using delta encoding, -meaning that only the file differences are sent through -subsequent transfers. For this project, an rsync server -daemon was set up on the NRAO side to expose the data -staging area where the products are collected. For security -reasons, the daemon was restricted to allow read-only -operations to clients originating from a designated JPL IP -address. On the JPL side, an rsync client was set up to run -hourly as a system cron job, transferring products to the JPL -archive area. To minimize bandwidth usage, the client only -transfers a very small subset of the data comprising a product -directory tree, namely the detection images and the output -and calibration files containing the metadata needed by the -web portal. 
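A minimal sketch of the kind of selection rule implied here, keeping only the detection imagery and the small text files that carry the metadata. It is written in Java purely for illustration; the actual transfer uses rsync include/exclude patterns, and the .out/.cal suffixes are assumptions, whereas -det.jpg and -dedisp.jpg are the image names used elsewhere in this paper.

import java.nio.file.Path;
import java.util.function.Predicate;

public class MirrorFilterSketch {
    // Keep detection plots plus the output and calibration files needed by the web portal.
    static final Predicate<Path> KEEP = p -> {
        String name = p.getFileName().toString();
        return name.endsWith("-det.jpg")        // candidate detection plot
                || name.endsWith("-dedisp.jpg") // de-dispersed plot
                || name.endsWith(".out")        // pipeline output file (assumed suffix)
                || name.endsWith(".cal");       // calibration file (assumed suffix)
    };
}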
On average, this represents a reduction of the -data product size by a factor of 3.5 ´ 103: from an average -size of ~35 GB on the NRAO server (for a product with -several detections), to ~10 MB on the JPL server. The rsync -data transfer rates between the two servers were measured to -be around ~2 MBs-1, more than enough to transfer between -10 and 20 data products per day. -CAS Crawler. Once the data products are transferred to the -JPL server, they are automatically detected by the OODT -CAS Crawler daemon, which runs at sub-hour time intervals -to pick up new products as soon as they become available. -The Crawler is responsible for notifying the OODT File -Manager and therefore starting the product ingestion process. -For this deployment, the Crawler was configured to send a -signal only if two preconditions are both satisfied: (1) a -similarly named product does not already exist in the File -Manager catalog and (2) the product directory contains a -special marker file indicating that the product has been -processed by the mail program, and therefore is in a complete -state (i.e., no files are missing). -CAS File Manager. The OODT CAS File Manager is a -customizable software component that is responsible for -processing and archiving a data product, making it available -for query and access to clients. For this project, the File -Manager was deployed with the default Apache Lucene -metadata back-end, and configured to archive products -Figure 1. Depiction of the full V-FASTR data flow with volume estimates (per job) at each stage. The candidate review framework (both metadata pipeline and web -portal components) interact with the metadata and derived products repository at the intersection of A and B above. -4 -The Astronomical Journal, 149:23 (7pp), 2015 January Hart et al. -in-place, i.e., without moving them to a separate archive -directory, otherwise the rsync process would transfer them -again from the NRAO server. Additionally, we leveraged the -extensibility of the OODT framework by configuring the File -Manager with custom metadata extractors that were -purposely written to parse the information contained in the -V-FASTR output and calibration files. Information is -extracted at the three levels that comprise the hierarchy of -a V-FASTR data product: job, scan, and event. Additionally, -a numerical algorithm was written to assign each pair of -available images (-det.jpg and -dedisp.jpg) to the event that -generated them. -In general, a File Manager can store metadata in its back-end -catalog as different object types. Each object type is defined to -contain multiple metadata fields, where each field is composed -of a named key associated to one or more string values. For this -project, the decision was made to maintain a one-to-one -correspondence between a data product and the corresponding -metadata ingested into the catalog. So rather than defining three -object types for jobs, scans, and events, a single object type -was used holding all information for a data product in a single -container, with dynamically named keys that are encoded to -contain the scan and event numbers. This decision was -motivated by the desire to simplify and optimize the querying -of information by the web portal client, since all metadata for a -product is retrieved through a single request to the File -Manager. As a consequence, the default Apache Lucene -metadata catalog implementation had to be slightly modified -to allow for the ingestion of dynamically named metadata -fields. -3.2.2. 
Web Portal -The second major component of the candidate review -framework is an interactive web portal. The primary purpose -of the portal is to provide a convenient online environment for -the location-independent perusal and assessment of potential -candidates in context. The portal provides V-FASTR analysts -with the ability to quickly navigate through the available -information to identify candidates worthy of further inspection -on a familiar web platform. -The portal has been implemented as a PHP web application -using the Apache OODT Balance web framework running on -top of the Apache HTTPD Web Server. OODT Balance was -chosen here for its ability to easily integrate with the OODT -components in the back-end metadata pipeline, namely the -OODT CAS File Manager described earlier. Furthermore, the -flexible, modular approach of the framework allowed us to -quickly connect the web portal to the metadata repository and -rapidly begin constructing the necessary views specific to the -V-FASTR candidate review and validation use cases. -As Figure 3 shows, the web portal offers a variety of views -of the available metadata which are hierarchically organized to -match the conceptual relationships in the data. At the highest -level, a job or run might consist of multiple scans, each of -which may itself contain multiple detection event candidates. -This hierarchy, expressed in the metadata, is preserved in the -layout of the portal views, and the breadcrumb navigation -provided to facilitate orientation within the nested structure. -At the level of an individual event candidate (Figure 3, -middle image), two graphical representations of the event are -available to assist analysts in classifying the nature of the -signal. These images are generated automatically as part of the -initial candidate identification process (Wayth et al. 2011), and -they provide a trained analyst the necessary structural clues -needed to rapidly assess the received signal as being genuinely -extraterrestrial in origin or merely a product of RFI. -To support both metadata browsing in context and the desire -for an analyst to be able to rapidly peruse the image -representations of an entire job (many events in many scans) -at once, a compromise was struck whereby, for each job, a -portal user may select a traditional, hierarchical navigation or a -flattened view in which all of the (possibly numerous) event -candidates are presented simultaneously on screen and can be -accessed serially simply by scrolling the view. -Figure 2. Component diagram for the metadata pipeline component of the VFASTR -candidate review framework. -5 -The Astronomical Journal, 149:23 (7pp), 2015 January Hart et al. -Together, the metadata pipeline and the web portal constitute -an end-to-end framework for capturing, archiving, and -presenting metadata about detected transient event candidates -to V-FASTR scientists. Furthermore, by providing a reliable -process and flexible interface, the system directly streamlines -the analysis process, boosting the overall efficiency of the -project. -4. EVALUATION -As we have described in the previous section, the candidate -review framework embraces the model of online collaborative -validation of fast transient candidates by a team of geographically -dispersed analysts, and improves the efficiency with -which analysts may classify observational data. 
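Two details from the pipeline and portal descriptions above fit together: the catalog stores all metadata for a data product in a single flat container whose dynamically named keys encode the scan and event numbers, and the portal then reassembles the job, scan, and event hierarchy for display. The Java sketch below shows one way such keys could be decoded back into that hierarchy; the key format, names, and values are invented for the example and are not the actual field naming used by the V-FASTR catalog.

    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.TreeMap;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // Hypothetical illustration of dynamically named metadata keys: one flat
    // map per data product, with scan and event numbers encoded in key names.
    public class DynamicKeys {
        private static final Pattern EVENT_KEY = Pattern.compile("scan(\\d+)\\.event(\\d+)\\.(\\w+)");

        public static void main(String[] args) {
            Map<String, String> productMetadata = new LinkedHashMap<>();
            productMetadata.put("scan3.event12.dm", "560.5");
            productMetadata.put("scan3.event12.detImage", "cand12-det.jpg");
            productMetadata.put("scan4.event1.dm", "102.3");

            // Rebuild the job -> scan -> event hierarchy that the portal displays.
            Map<Integer, Map<Integer, Map<String, String>>> hierarchy = new TreeMap<>();
            for (Map.Entry<String, String> e : productMetadata.entrySet()) {
                Matcher m = EVENT_KEY.matcher(e.getKey());
                if (!m.matches()) {
                    continue; // job-level fields would be handled separately
                }
                hierarchy.computeIfAbsent(Integer.valueOf(m.group(1)), k -> new TreeMap<>())
                        .computeIfAbsent(Integer.valueOf(m.group(2)), k -> new LinkedHashMap<>())
                        .put(m.group(3), e.getValue());
            }
            System.out.println(hierarchy); // {3={12={dm=560.5, detImage=cand12-det.jpg}}, 4={1={dm=102.3}}}
        }
    }

Keeping everything for a product in one container is what lets the portal fetch a whole product with a single File Manager request, as noted above.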
In this section -we describe the early results of our experience with the -operational deployment of the framework, as well as highlight -several areas for the evolution of the tool to further enhance its -utility. -4.1. Experience -The initial deployment of the collaborative review framework -for operational use by the V-FASTR science team was -made in early summer 2012. The immediate feedback was -largely positive: analysts praised the capabilities of the system, -the general improved accessibility afforded by a web-based -user interface, and the newfound capability to easily navigate -rapidly through all detections in a given job, or peruse the -different levels (scans and events) within a job individually. -The biggest initial complaint with the system was that too -many mouse clicks were required to complete an analysis of all -of the candidates in an entire job. -A consequence of the iterative feedback loop that developed -between the software and science teams (described further in -Section 3) was that suggestions for improvements were -repeatedly made, tracked, and acted upon. This process resulted -in an updated release occurring approximately every two weeks -during the first few months of the deployment. Suggestions for -improvements included the addition of various metadata fields -identified as critical to the classification task, updates to the -visual organization of the elements of the web portal views, and -a relentless focus on reducing the number of mouse clicks -required on the part of analyst users. -By the time of this writing, the V-FASTR portal has been -running operationally for several weeks, and we can draw some -early conclusions on usefulness of the system. Overall, as -reported by the science team, it seems like the project has -definitely accomplished its broad goal of facilitating the -collaborative task of inspecting and screening radio-transient -events. By extracting all relevant metadata from the latest data -products, and presenting it on the web portal in a concise -fashion, scientists can now execute their tasks more efficiently, -compared to earlier times when they had to log onto a terminal -and analyze the raw data manually. Additionally, the online -availability of all data and metadata through a browser interface -(as opposed to an ssh terminal) has allowed for greater -flexibility with regard to when and where evaluations can be -performed, including for the first time on a mobile device. -4.2. Evolution -On the whole, the ability to interact with the totality of the -candidate data and metadata through a browser interface has -greatly expanded the analysts’ ability to perform their tasks -with greater flexibility regarding when and where evaluations -can be performed. This includes, for the first time, anecdotal -accounts of an analyst reviewing candidates from a mobile -device. -With this freedom, in turn, has come a number of feature -requests which can be taken together to form a roadmap of -sorts for the evolution of the framework. Now that the -interaction with candidate metadata has transitioned to the -browser, the science team has identified three key features they -feel would complete the transition and entirely replace the prior -ad-hoc methods for coordinating the analysts’ activities: -Job assignment. As mentioned in Section 3, the timely -review of detection candidates is critical to remaining within -the resource constraints imposed upon the experiment. At the -moment, review jobs are assigned to analysts via email. 
-Augmenting the web portal with the ability to identify an -individual analyst would enable the presentation of -a prioritized list of that analystʼs outstanding review tasks. -Effort Tracking. Along the same lines, it is important to -spread the analysis load evenly across the science team, since -no one person is performing the analysis as his or her fulltime -job. Augmenting the framework with the ability to track -the analysis contributions of individual users over time -would assist in the equitable scheduling of future -review jobs. -Figure 3. Screen shots of the initial version of the web portal component. From left to right: the portal home page displaying recent jobs and associated event counts, -image metadata associated with an individual event candidate, full metadata listing, including associated scans, for an observation job. -6 -The Astronomical Journal, 149:23 (7pp), 2015 January Hart et al. -In-browser archiving. When an analyst determines a -candidate event merits high-resolution followup, the last -step is to archive the associated raw data so that it can be -evaluated at a later date. Currently, due to the security -restrictions permitting read-only access to external connections -to the archive at the NRAO (described in Section 3), -this process is handled out-of-band by the analyst logging -into an NRAO machine and archiving the appropriate data -manually. It is possible that, with the identity management -features discussed in the previous two items (and the -associated auditing capabilities that it could entail) the -restrictions might be negotiated to the point that certain -defined activities (such as archiving a single job directory) -could be initiated from within the portal environment. -5. CONCLUSION -V-FASTR, and commensal operations more generally, are -particularly challenging experiments due to extreme data -volume and real-time requirements. Processing occurs continually, -and the data flow must be coordinated across multiple -physical locations with transport mechanisms ranging from -FedEx transport (disks from the antenna), high-bandwidth -interconnects (the correlator and transient detection systems), -daily rsync over IP (the ska-dc mirror), and distributed WWW -protocols (manual review which takes place by analysts on -three continents). Various components of the system operate on -millisecond, hourly, and daily clocks and all components must -continue operating since there is very little margin for buffer -resources. In addition, the data processing components are -highly heterogeneous, with human experts playing their own -role as scheduled pattern recognition engines in the overall -architecture. By facilitating timely review, and reducing the -learning curve for new reviewers, the V-FASTR portal will -play a critical role in keeping the data flowing and making the -system sustainable in the long term. -This effort was supported by the Jet Propulsion Laboratory, -managed by the California Institute of Technology under a -contract with the National Aeronautics and Space -Administration. -REFERENCES -Cooper, S., Khatlib, F., & Treuille, A. 2010, Natur, 466, 756–60 -Cordes, J., Lazio, T., & McLaughlin, M. 2004, NewAR, 48, 1459–72 -Crichton, D., Kincaid, H., Downing, G., Srivastava, S., & Hughes, J. S. 2001, -in Proc. of the 14th IEEE Symp. on Computer-Based Medical Systems, An -Interoperable Data Architecture for Data Exchange in a Biomedical -Research Network (Piscataway, NJ: IEEE), 65–72 -Crichton, D., Mattmann, C., Hart, A., et al. 2011, in Proc. 
of the 24th -IEEE Symp. on Computer-Based Medical Systems An Informatics -Architecture for the Virtual Pediatric Intensive Care Unit (Piscataway, -NJ: IEEE), 1–6 -Hart, A., Goodale, C., Mattmann, C., et al. 2011, in Proc. of the 2nd Int. -Workshop on Software Engineering for Cloud Computing, A Cloudenabled -Regional Climate Model Evaluation System (New York: -ACM), 43–49 -Mattmann, C., Crichton, D., Medvidivic, N., & Hughes, J. S. 2006, in Proc. -2006 Int. Conf. on Software Engineering, A Software Architecture-based -Framework for Highly Distributed and Data Intensive Scientific -Applications (New York: ACM), 721–30 -Mattmann, C., Freeborn, D., Crichton, D., et al. 2009, in Proc. IEEE Int. Conf. -on Space Mission Challenges for Information Technology, A Reusable -Process Control System Framework for the Orbiting Carbon Observatory -and NPP Sounder PEATE Missions (Piscataway, NJ: IEEE), 165–72 -Romney, J. D. 2010, NRAO, http://www.vlba.nrao.edu/astro/obstatus/current/ -obssum.html. -Siemion, A., von Korff, J., McMahon, P., Korpela, E., & Werthimer, D. 2010, -AcAau, 67, 1342–9 -Thompson, D., Wagstaff, K., Brisken, W., et al. 2011, ApJ, 735, 98 -Tran, J., Cinquini, L., Mattmann, C., et al. 2011, in Evaluating Cloud -Computing in the NASA DESDynI Ground Data System Proc. of the II -International Workshop on Software Engineering for Cloud Computing -(New York: ACM), 36–42 -Wayth, R., Brisken, W., Deller, A., et al. 2011, ApJ, 735, 97 -Wayth, R., Tingay, S., & Deller, A. 2012, ApJL, 753, L36 -7 -The Astronomical Journal, 149:23 (7pp), 2015 January Hart et al. \ No newline at end of file +v-fastr experiment&a case study andrew f. hart and Pasadena and CA 91109 and USA&依赖 +FRAMEWORK&radio-frequency energy&依赖 +California Institute&Technology&AGGREGATION +v-fastr experiment&a case study andrew f. hart and Pasadena and CA 91109 and USA&依赖 +FRAMEWORK&bright millisecond pulse&依赖 +bright millisecond pulse&radio-frequency energy&AGGREGATION +shortduration pulse&known object&依赖 +shortduration pulse&pulsar&依赖 +identification and verification&event&AGGREGATION +one major goal&very long baseline array ( vlba ) fast transient experiment ( v-fastr )&AGGREGATION +V-FASTR&“ commensal ” ( piggy-back ) approach&依赖 +Raw datum&buffer memory&依赖 +buffer memory&comprehensive off-line analysis&依赖 +astrophysical origin&detection&AGGREGATION +promising signal&more in-depth investigation&依赖 +candidate&analyst&依赖 +we&metadata-driven , collaborative candidate review framework&依赖 +timely analysis&fast transient detection candidate&AGGREGATION +software pipeline&pipeline&GENERALIZATION +framework&software pipeline&依赖 +framework&metadata processing&依赖 +facilitates browsing and inspection&available metada&AGGREGATION +current primary goal&radio astronomy&AGGREGATION +INTRODUCTION One&” ( and cordes et al. 2004 )&依赖 +INTRODUCTION One&cordes et al. 2004 )&依赖 +INTRODUCTION One&cordes et al. 2004 )&依赖 +INTRODUCTION One&” ( and cordes et al. 2004 )&依赖 +INTRODUCTION One&” ( and cordes et al. 2004 )&依赖 +INTRODUCTION One&cordes et al. 
2004 )&依赖 +INTRODUCTION One¤t primary goal&AGGREGATION +scientific thrust&contrast&依赖 +scientific thrust&known source&依赖 +scientific thrust&transient event&依赖 +generate catalog&known source&AGGREGATION +scientific thrust&generate catalog&依赖 +scope and distribution&different transient source&AGGREGATION +We&different transient source&依赖 +We&scope and distribution&依赖 +“ most exciting&sources ”&依赖 +new class&sources ”&AGGREGATION +“ most exciting&original )&依赖 +“ most exciting&sources ”&依赖 +discovery&sources ”&AGGREGATION +“ most exciting&new class&依赖 +discovery&new class&AGGREGATION +“ most exciting&( italics&依赖 +Radio telescope&sky&依赖 +their&data& +detection and characterization and ” which&“ fast radio transient&AGGREGATION +detection and characterization and ” which¤t particular interest&依赖 +detection and characterization and ” which¤t particular interest&依赖 +small fraction&second&AGGREGATION +V-FASTR experiment&radio astronomy experiment&依赖 +one&new breed&AGGREGATION +new breed&radio astronomy experiment&AGGREGATION +V-FASTR experiment&transient radio signal&依赖 +V-FASTR experiment&experiment&GENERALIZATION +V-FASTR experiment&transient radio signal&依赖 +V-FASTR experiment&radio astronomy experiment&依赖 +its&instrument& +regular processing activity&host instrument&AGGREGATION +V-FASTR experiment&2 and 10 telescope&依赖 +V-FASTR experiment&National Radio Astronomy Observatory ʼs&依赖 +V-FASTR experiment&National Radio Astronomy Observatory ʼs&依赖 +2 and 10 telescope&National Radio Astronomy Observatory ʼs ( nrao ) very long baseline array ( vlba ) ( romney 2010 )&AGGREGATION +V-FASTR experiment&2 and 10 telescope&依赖 +V-FASTR experiment&( rfus ) and potentially genuine astronomical pulse&依赖 +V-FASTR experiment&configuration&依赖 +VLBA&10 25 m telescope&依赖 +V-FASTR experiment&thompson et al. 2011 )&依赖 +instance&terrestrial Radio Frequency Interference ( rfus ) and potentially genuine astronomical pulse&AGGREGATION +huge volume&entire observing session infeasible&依赖 +huge volume&full record&依赖 +huge volume&full record&依赖 +its&operation& +course&operation&AGGREGATION +full record&entire observing session infeasible&AGGREGATION +huge volume&entire observing session infeasible&依赖 +huge volume&entire observing session infeasible&依赖 +huge volume&raw time-series voltage datum&AGGREGATION +huge volume&full record&依赖 +considerable effort&noisy and often incomplete datum&依赖 +considerable effort&consequence&依赖 +considerable effort&interesting signal&依赖 +considerable effort&fine-tuning algorithm&依赖 +considerable effort&realtime identification&依赖 +realtime identification&interesting signal&AGGREGATION +considerable effort&Thompson et al. 
2011&依赖 +datum&distributed V-FASTR science team&依赖 +member&distributed V-FASTR science team&AGGREGATION +datum&daily basis&依赖 +datum&member&依赖 +V-FASTR science team&science team&GENERALIZATION +practical workload constraint&candidate detection&依赖 +V-FASTR experiment&several important resource constraint&依赖 +practical workload constraint&candidate detection&依赖 +we&V-FASTR experiment&依赖 +automatic capture and organization&metada&AGGREGATION +framework&set&依赖 +set&software component&AGGREGATION +framework&candidate event&依赖 +framework&software component&依赖 +collaborative perusal and inspection&related imagery datum&AGGREGATION +rest&paper&AGGREGATION +rest&system&依赖 +rest&system&依赖 +we&more general context&依赖 +we&Section 2&依赖 +our&project& +we&project&依赖 +architectural description&system&AGGREGATION +Section 3&methodology&依赖 +evaluation&Astronomical Journal 149:23 ( 7pp )&AGGREGATION +We&evaluation&依赖 +We&Astronomical Journal&依赖 +We&149:23 ( 7pp )&依赖 +our&experience& +development&scientific data system&AGGREGATION +we&VFASTR experiment&依赖 +context&system implementation&AGGREGATION +We&object oriented data technology ( oodt ) project&依赖 +open source information integration platform&framework&依赖 +open source information integration platform¢ral role&依赖 +We&open source information integration platform&依赖 +our&framework& +we&online tool&依赖 +we&several related effort&依赖 +Principal investigator&proposal&依赖 +Principal investigator&VLBA&依赖 +V-FASTR&datum&依赖 +V-FASTR&datum&依赖 +part&routine processing&AGGREGATION +nightly list&candidate&AGGREGATION +V-FASTR&datum&依赖 +candidate list&timely basis&依赖 +candidate list&expert&依赖 +raw datum&significant disk space&依赖 +generation&sky image&AGGREGATION +their&space& +source&signal&AGGREGATION +Software tool&review process&依赖 +Software tool&candidate reviewer&依赖 +review process&process&GENERALIZATION +jpl data management systems and technologies group&software ground data system&依赖 +jpl data management systems and technologies group&software ground data system&依赖 +they&scientist&依赖 +other operational constraint&expected production environment&AGGREGATION +process&project scientist&依赖 +process&close collaboration&依赖 +understanding&processing algorithm&AGGREGATION +sense&scale and throughput requirement&AGGREGATION +broad spectrum&domain&AGGREGATION +group&data system experience&依赖 +group&diverse portfolio&依赖 +group&data system experience&依赖 +group&diverse portfolio&依赖 +group&diverse portfolio&依赖 +group&data system experience&依赖 +group&data system experience&依赖 +group&data system experience&依赖 +group&data system experience&依赖 +group&diverse portfolio&依赖 +group&diverse portfolio&依赖 +group&diverse portfolio&依赖 +diverse portfolio&data system experience&AGGREGATION +Open Source&software component&依赖 +their&design& +suite&software component&AGGREGATION +OODT1&investment&依赖 +long track record&experience&AGGREGATION +mission-specific customization&top&依赖 +realm&scientific data processing system&AGGREGATION +OODT1&individual mission data system&依赖 +OODT One&product&AGGREGATION +reusable platform&configurable component&AGGREGATION +OODT1&return&依赖 +part&NASAʼs Office&AGGREGATION +NASAʼs Office&Space Science&AGGREGATION +product&long track record&AGGREGATION +Open Source&known&依赖 +OODT&architecture&依赖 +component&one another&依赖 +component&XML-RPC2&依赖 +component&standard , open protocol&依赖 +Its&components& +2009 oodt&transition&依赖 +2009 oodt&JPL-internal development project&依赖 +2009 oodt&free and open source software project&依赖 +2009 oodt&JPL-internal development project&依赖 
+2009 oodt&free and open source software project&依赖 +2009 oodt&transition&依赖 +OODT&ASF&依赖 +varied group&scientific and commercial endeavor&AGGREGATION +OODT&several public release&依赖 +several OODT component&candidate validation framework&依赖 +several OODT component&core platform&依赖 +core platform&candidate validation framework&AGGREGATION +we&Section 3&依赖 +ready availability&time and budgetary consideration&依赖 +ready availability&project&依赖 +ready availability&time and budgetary consideration&依赖 +ready availability&project&依赖 +ready availability&time and budgetary consideration&依赖 +ready availability&time and budgetary consideration&依赖 +ready availability&OODT component&AGGREGATION +their&pedigree& +ready availability&project&依赖 +ready availability&project&依赖 +several ongoing effort&online tool&依赖 +collaborative review and classification&scientific observation&AGGREGATION +we§ion&依赖 +we&several ongoing effort&依赖 +several ongoing effort&assist&依赖 +part&series&AGGREGATION +Astropulse Astropulse&sky survey&依赖 +Astropulse Astropulse&radio pulse&依赖 +University&berkeley (&AGGREGATION +series&sky survey&AGGREGATION +Astropulse Astropulse&Astropulse&GENERALIZATION +Astropulse Astropulse&series&依赖 +Astropulse project&survey&依赖 +Astropulse project&survey&依赖 +Astropulse project&project&GENERALIZATION +Astropulse project&survey&依赖 +Astropulse project&sky&依赖 +survey&sky&AGGREGATION +Astropulse project&sky&依赖 +Astropulse project&sky&依赖 +v-fastrʼs ability&greater positional accuracy&依赖 +Astropulseʼs use&excellent sensitivity&依赖 +v-fastrʼs ability&greater positional accuracy&依赖 +v-fastrʼs ability&greater positional accuracy&依赖 +v-fastrʼs ability&greater positional accuracy&依赖 +Astropulseʼs use&excellent sensitivity&依赖 +Astropulseʼs use&Areciboʼs enormous single dish telescope&AGGREGATION +variant&SETI@home project&AGGREGATION +Astropulse&same distributed , collaborative volunteer computing infrastructure&依赖 +Astropulse&same distributed , collaborative volunteer computing infrastructure&依赖 +Astropulse&same distributed , collaborative volunteer computing infrastructure&依赖 +Astropulse&same distributed , collaborative volunteer computing infrastructure&依赖 +origin&signal&AGGREGATION +number&intense transformations and calculation&AGGREGATION +intense transformations and calculation&datum&AGGREGATION +unit&computational work&AGGREGATION +use&volunteer computing&AGGREGATION +automated nature&V-FASTRʼs manual review requirement&依赖 +automated nature&approach&AGGREGATION +automated nature&V-FASTRʼs manual review requirement&依赖 +help&volunteer&AGGREGATION +large database&galaxy image&AGGREGATION +2015 january hart et al&rotation&AGGREGATION +project&date&依赖 +discovery&new interesting object&AGGREGATION +million&image&AGGREGATION +confirmation&important scientific hypothesis&AGGREGATION +formulation&new one&AGGREGATION +success&expectation&依赖 +project&success&依赖 +challenge&image classification&AGGREGATION +paradigm&v-fastr setting&依赖 +nature&archive&AGGREGATION +its&limits& +Galaxy Zoo&volunteer reviewer&依赖 +its&reviewers& +Galaxy Zoo&leisurely peruse&依赖 +University&Washington&AGGREGATION +it&“ crowd-sourced ” attempt&依赖 +protein and chain&wide range&依赖 +protein and chain&wide range&依赖 +protein and chain&human disease&依赖 +they&specific shape&依赖 +protein and chain&key role&依赖 +protein and chain&key role&依赖 +wide range&human disease&AGGREGATION +protein and chain&key role&依赖 +protein and chain&human disease&依赖 +their&function& +they&themselves&依赖 +chain&amino acid&AGGREGATION +protein and chain&human disease&依赖 
+specific shape&function&依赖 +protein and chain&wide range&依赖 +researcher&challenge&依赖 +researcher&puzzle-solving capability&依赖 +researcher&challenge&依赖 +researcher&scale and complexity&依赖 +researcher&scale and complexity&依赖 +researcher&human being&依赖 +researcher&assistance&依赖 +researcher&assistance&依赖 +researcher&human being&依赖 +researcher&puzzle-solving capability&依赖 +scale and complexity&challenge&AGGREGATION +puzzle-solving capability&human being&AGGREGATION +player&arriving&依赖 +arrangement&total energy&依赖 +player&goal&依赖 +unknown and diverse strategy&human participant&AGGREGATION +its&participants& +Foldit&environment&依赖 +project&400,000 player&依赖 +implementation&metadata-driven framework&AGGREGATION +online review&V-FASTR candidate detection event&AGGREGATION +presentation&system architecture&AGGREGATION +our&methodology& +our&design& +consideration&design&依赖 +We&methodology&依赖 +their&imprint& +Development Methodology Several factor&development process&依赖 +development process&process&GENERALIZATION +need&VFASTR project&AGGREGATION +our&implementation& +aspect&design process&AGGREGATION +group&substantial experience&依赖 +group&broad range&实现 +our&group& +broad range&scientific domain&AGGREGATION +design and implementation&data system&AGGREGATION +group&design and implementation&依赖 +group&data system&实现 +close working relationship&success&依赖 +close working relationship&case&依赖 +success&project&AGGREGATION +close working relationship&project&依赖 +close working relationship&case&依赖 +close working relationship&success&依赖 +member&project science team&AGGREGATION +close working relationship&case&依赖 +close working relationship&success&依赖 +close working relationship&project&依赖 +close working relationship&project&依赖 +intuition&us&依赖 +intuition&software engineer&依赖 +our&intuitions& +intuition&software engineer&依赖 +intuition&software engineer&依赖 +intuition&us&依赖 +intuition&us&依赖 +technical challenge&system&AGGREGATION +it&V-FASTR science team&依赖 +it&member&依赖 +early identification&issue&AGGREGATION +our&communication& +member&V-FASTR science team&AGGREGATION +current system architecture&ongoing feedback loop&依赖 +current system architecture&science and software team&依赖 +direct result&ongoing feedback loop&AGGREGATION +part®ular third-party use&AGGREGATION +regular third-party use&VLBA instrument&AGGREGATION +experiment&“ guest ” status&依赖 +experiment&NRAO computing infrastructure&依赖 +experiment&“ guest ” status&依赖 +many&architectural decision&AGGREGATION +physical constraint&architectural decision&依赖 +physical constraint&many&依赖 +two type&product&AGGREGATION +data product&product&GENERALIZATION +hundred&file&AGGREGATION +V-FASTR data product&file&依赖 +V-FASTR data product&data product&GENERALIZATION +~ 800 job&~ 10 GB&AGGREGATION +storage&few week&依赖 +product&~ 10 – 20&依赖 +average rate&~ 10 – 20&AGGREGATION +product&average rate&依赖 +product&day (&依赖 +product&science team analyst&依赖 +candidate&review&依赖 +overview&average data volume&AGGREGATION +candidate&higher-resolution processing (&依赖 +core&design&AGGREGATION +network bandwidth constraint&host&AGGREGATION +network bandwidth constraint&us&依赖 +network bandwidth constraint&data transfer configuration&依赖 +complete transfer&raw , unreviewed , and possibly spurious detection datum&AGGREGATION +network bandwidth constraint&data transfer configuration&依赖 +network bandwidth constraint&us&依赖 +salient characteristic&candidate event&AGGREGATION +metada sufficient&candidate review framework&依赖 +candidate event&event&GENERALIZATION +careful selection 
process&beneficial side effect&依赖 +careful selection process&size&依赖 +careful selection process&allowing&依赖 +size&transferred product&AGGREGATION +longer retention period&JPL&依赖 +longer retention period&JPL&依赖 +security constraint&design&依赖 +system&two separate security domain&依赖 +system&NRAO and JPL&依赖 +security requirement&host system&AGGREGATION +functionality&possibility&依赖 +possibility&corruption&AGGREGATION +functionality&corruption&依赖 +functionality&corruption&依赖 +functionality&possibility&依赖 +software framework&framework&GENERALIZATION +we&consisting&依赖 +capture , transfer , and storage&metadata annotation&AGGREGATION +two principal component&capture , transfer , and storage&依赖 +web portal&portal&GENERALIZATION +we&software framework&依赖 +two principal component&metadata annotation&依赖 +V-FASTR data product&Metadata Pipeline&依赖 +V-FASTR data product&JPL side&依赖 +V-FASTR data product&metadata extraction and data archiving pipeline&依赖 +pipeline&three major software component&依赖 +pipeline&rsync&依赖 +Data product&NRAO staging area&依赖 +Data product&jpl server use rsync&依赖 +content&directory tree&AGGREGATION +jpl server use rsync&two server&依赖 +jpl server use rsync&minimal human intervention&依赖 +jpl server use rsync&content&依赖 +It&simplicity&依赖 +wide range&configuration option&AGGREGATION +its&simplicity& +file&compressed format&依赖 +client&product directory tree&依赖 +client&datum&依赖 +client&metada&依赖 +small subset&datum&AGGREGATION +client&small subset&依赖 +reduction&data product size&AGGREGATION +factor&3.5 � 103&AGGREGATION +average size&~ 35 GB&AGGREGATION +data product&JPL server&依赖 +OODT CAS Crawler daemon&new product&依赖 +they&OODT CAS Crawler daemon&依赖 +product directory&indicating&依赖 +named product&file manager catalog and ( 2 )&依赖 +product directory&special marker file&依赖 +no file&missing& +software component&component&GENERALIZATION +File Manager&project&依赖 +Depiction&volume estimate&依赖 +Depiction&job )&依赖 +Depiction&job )&依赖 +Depiction&volume estimate&依赖 +Depiction&volume estimate&依赖 +Depiction&job )&依赖 +Depiction&job )&依赖 +Depiction&full v-fastr datum flow&AGGREGATION +Depiction&volume estimate&依赖 +Depiction&volume estimate&依赖 +Depiction&job )&依赖 +candidate review framework&A and B&依赖 +candidate review framework&metada and derive product repository&依赖 +candidate review framework&metada and derive product repository&依赖 +candidate review framework&intersection&依赖 +intersection&A and B&AGGREGATION +candidate review framework&A and B&依赖 +candidate review framework&intersection&依赖 +rsync process&NRAO server&依赖 +Astronomical Journal 149:23 ( 7pp )&2015 january hart et al&依赖 +Astronomical Journal 149:23 ( 7pp )&2015 january hart et al&依赖 +Astronomical Journal 149:23 ( 7pp )&in-place&依赖 +Astronomical Journal 149:23 ( 7pp )&in-place&依赖 +rsync process&them&依赖 +we&OODT framework&依赖 +extensibility&OODT framework&AGGREGATION +we&extensibility&依赖 +OODT framework&framework&GENERALIZATION +three level&V-FASTR data product&依赖 +Information&job , scan , and event&依赖 +hierarchy&V-FASTR data product&AGGREGATION +Information&three level&依赖 +pair&available image&AGGREGATION +event&them&依赖 +File Manager&metada&依赖 +File Manager&back-end catalog&依赖 +File Manager&different object type&依赖 +its&catalog& +field&named key&依赖 +querying&information&AGGREGATION +decision&desire&依赖 +metada&single request&依赖 +metada&File Manager&依赖 +decision&querying&实现 +decision&web portal client&实现 +ingestion&dynamically name metadata field&AGGREGATION +second major component&candidate review framework&AGGREGATION +primary 
purpose&portal&AGGREGATION +location-independent perusal and assessment&potential candidate&AGGREGATION +top&Apache HTTPD Web Server&AGGREGATION +portal&Apache OODT Balance web framework&实现 +portal&running&实现 +portal&PHP web application&实现 +OODT Balance&its ability&依赖 +its&ability& +OODT Balance&integrate&依赖 +flexible , modular approach&web portal&依赖 +flexible , modular approach&framework&AGGREGATION +flexible , modular approach&web portal&依赖 +flexible , modular approach&us&依赖 +flexible , modular approach&us&依赖 +web portal&view&依赖 +web portal&available metada&依赖 +variety&view&AGGREGATION +web portal&variety&依赖 +web portal&view&依赖 +view&available metada&AGGREGATION +web portal&available metada&依赖 +web portal&variety&依赖 +job or run&multiple scan&依赖 +job or run&highest level&依赖 +layout&portal view&AGGREGATION +nature&signal&AGGREGATION +level&individual event candidate Figure 3&AGGREGATION +two graphical representation&nature&依赖 +two graphical representation&analyst&依赖 +two graphical representation&event&AGGREGATION +two graphical representation&analyst&依赖 +two graphical representation&signal&依赖 +two graphical representation&nature&依赖 +two graphical representation&signal&依赖 +part&initial candidate identification process&AGGREGATION +necessary structural clue&received signal&依赖 +product&RFI&AGGREGATION +image representation&an entire job ( many event&AGGREGATION +portal user&traditional , hierarchical navigation&依赖 +portal user&job&依赖 +metadata pipeline component&VFASTR candidate review framework&AGGREGATION +5&end-to-end framework&依赖 +5&capture , archiving&依赖 +5&end-to-end framework&依赖 +5&capture , archiving&依赖 +analysis process&process&GENERALIZATION +system&analysis process&依赖 +overall efficiency&project&AGGREGATION +online collaborative validation&fast transient candidate&AGGREGATION +candidate review framework&online collaborative validation&依赖 +we&previous section&依赖 +candidate review framework&model&依赖 +candidate review framework&fast transient candidate&依赖 +candidate review framework&fast transient candidate&依赖 +candidate review framework&model&依赖 +candidate review framework&model&依赖 +candidate review framework&online collaborative validation&依赖 +candidate review framework&fast transient candidate&依赖 +candidate review framework&online collaborative validation&依赖 +analyst&observational datum&依赖 +model&online collaborative validation&AGGREGATION +analyst&efficiency&依赖 +team&dispersed analyst&AGGREGATION +we&experience&依赖 +we&early result&依赖 +we&early result&依赖 +we&experience&依赖 +early result&experience&AGGREGATION +we&experience&依赖 +its&utility& +we&early result&依赖 +operational deployment&framework&AGGREGATION +we&early result&依赖 +we&experience&依赖 +evolution&tool&AGGREGATION +Experience The initial deployment&early summer 2012&依赖 +Experience The initial deployment&collaborative review framework&AGGREGATION +analyst&capability&依赖 +analyst&system&依赖 +capability&system&AGGREGATION +consequence&iterative feedback loop&AGGREGATION +process&updated release&依赖 +process&occurring&依赖 +first few month&deployment&AGGREGATION +part&analyst user&AGGREGATION +suggestion&addition&依赖 +number&mouse click&AGGREGATION +suggestion&various metadata field&依赖 +suggestion&various metadata field&依赖 +addition&various metadata field&AGGREGATION +element&web portal view&AGGREGATION +visual organization&element&AGGREGATION +suggestion&addition&依赖 +time&writing&AGGREGATION +V-FASTR portal&system&依赖 +V-FASTR portal&time&依赖 +we&usefulness&依赖 +V-FASTR portal&several week&依赖 +we&early conclusion&依赖 +V-FASTR 
portal&portal&GENERALIZATION +V-FASTR portal&writing&依赖 +we&system&依赖 +usefulness&system&AGGREGATION +V-FASTR portal&usefulness&依赖 +project&broad goal&依赖 +project&radio-transient event&依赖 +project&collaborative task&依赖 +its&goal& +their&tasks& +scientist&task&依赖 +online availability&greater flexibility&依赖 +online availability&greater flexibility&依赖 +online availability&greater flexibility&依赖 +online availability®ard&依赖 +online availability®ard&依赖 +online availability®ard&依赖 +online availability&greater flexibility&依赖 +online availability&data and metada&AGGREGATION +online availability®ard&依赖 +totality&candidate data and metada&AGGREGATION +anecdotal account&analyst&AGGREGATION +number&feature request&AGGREGATION +evolution&framework&AGGREGATION +roadmap&sort&AGGREGATION +interaction&browser&依赖 +’ activity&transition&依赖 +’ activity&transition&依赖 +interaction&browser&依赖 +science team&three key feature&依赖 +timely review&remaining& +timely review&detection candidate&AGGREGATION +review job&moment&依赖 +review job&email&依赖 +review job&analyst&依赖 +prioritized list&analystʼs outstanding review task&AGGREGATION +presentation&prioritized list&AGGREGATION +one person&analysis&依赖 +it&analysis load&依赖 +one person&fulltime job&依赖 +equitable scheduling&future review job&AGGREGATION +analysis contribution&individual user&AGGREGATION +screen shot&initial version&AGGREGATION +initial version&web portal component&AGGREGATION +image metada&full metadata listing&依赖 +image metada&individual event candidate&依赖 +portal home page&recent job&依赖 +last step&associated raw datum&依赖 +analyst&a candidate event merit&依赖 +analyst&high-resolution followup&依赖 +it&later date&依赖 +analyst&NRAO machine&依赖 +certain defined activity&portal environment&依赖 +CONCLUSION V-FASTR&V-FASTR&GENERALIZATION +CONCLUSION V-FASTR&particularly challenging experiment&依赖 +data flow&ranging&依赖 +data flow&transport mechanism&依赖 +manual review&place&依赖 +manual review&analyst&依赖 +data flow&multiple physical location&依赖 +manual review&three continent )&依赖 +Various component&millisecond and hourly&依赖 +Various component&millisecond and hourly&依赖 +Various component&system&AGGREGATION +human expert&scheduled pattern recognition engine&依赖 +human expert&own role&依赖 +data processing component&addition&依赖 +human expert&overall architecture&依赖 +their&role& +datum&system&依赖 +V-FASTR portal&critical role&依赖 +effort&Jet Propulsion Laboratory&依赖 +Interoperable Data Architecture&et al. 2011&依赖 +Interoperable Data Architecture&et al. 2011&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&et al. 2011&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&et al. 2011&依赖 +Interoperable Data Architecture&et al. 2011&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&et al. 2011&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&et al. 2011&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +Interoperable Data Architecture&et al. 
2011&依赖 +Interoperable Data Architecture&Computer-Based Medical Systems&依赖 +A Software Architecture-based Framework&Software Engineering&依赖 +A Software Architecture-based Framework&acm ) , 721 – 30 mattmann , c. , freeborn , d. , crichton , d. , et al.
2009&依赖 +A Software Architecture-based Framework&Software Engineering&依赖 +npp sounder peate missions ( piscataway&Information Technology&依赖 +npp sounder peate missions ( piscataway&space mission challenge&依赖 +npp sounder peate missions ( piscataway&Information Technology&依赖 +npp sounder peate missions ( piscataway&Information Technology&依赖 +npp sounder peate missions ( piscataway&Information Technology&依赖 +npp sounder peate missions ( piscataway&Information Technology&依赖 +npp sounder peate missions ( piscataway&Information Technology&依赖 +npp sounder peate missions ( piscataway&space mission challenge&依赖 +npp sounder peate missions ( piscataway&Information Technology&依赖 +npp sounder peate missions ( piscataway&space mission challenge&依赖 +npp sounder peate missions ( piscataway&space mission challenge&依赖 +npp sounder peate missions ( piscataway&Information Technology&依赖 +acm ) , 36 – 42 wayth , r. ,&2015 january hart et al&依赖 +2015 january hart et al&II International Workshop&AGGREGATION +acm ) , 36 – 42 wayth , r. ,&Software Engineering&依赖 +acm ) , 36 – 42 wayth , r. ,&2015 january hart et al&依赖 +acm ) , 36 – 42 wayth , r. ,&cloud computing ( new york&依赖 +acm ) , 36 – 42 wayth , r. ,&cloud computing ( new york&依赖 +acm ) , 36 – 42 wayth , r. ,&Software Engineering&依赖 +acm ) , 36 – 42 wayth , r. ,&II International Workshop&依赖 +acm ) , 36 – 42 wayth , r. ,&II International Workshop&依赖 +acm ) , 36 – 42 wayth , r. ,&II International Workshop&依赖 +acm ) , 36 – 42 wayth , r. ,&II International Workshop&依赖 +acm ) , 36 – 42 wayth , r. ,&cloud computing ( new york&依赖 +acm ) , 36 – 42 wayth , r. ,&2015 january hart et al&依赖 +acm ) , 36 – 42 wayth , r. ,&2015 january hart et al&依赖 +acm ) , 36 – 42 wayth , r. ,&Software Engineering&依赖 +acm ) , 36 – 42 wayth , r. ,&Software Engineering&依赖 +acm ) , 36 – 42 wayth , r. ,&cloud computing ( new york&依赖 diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-simEnts.txt index c92ae4773b74b21b9aa7b209b085a9f0f86cb88b..05c9aae02d64594a7f4889a91097c3dedad909d9 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-simEnts.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-simEnts.txt @@ -1,637 +1,679 @@ -A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE -STREAMS: THE V-FASTR EXPERIMENT AS A CASE STUDY -Andrew F. Hart, Luca Cinquini, Shakeh E. Khudikyan, David R. Thompson, -Chris A. Mattmann, Kiri Wagstaff, Joseph Lazio, and Dayton Jones -Jet Propulsion Laboratory, California Institute of Technology, Pasadena, CA 91109, USA; andrew.f.hart@jpl.nasa.gov -Received 2014 March 24; accepted 2014 August 10; published 2014 December 16 -ABSTRACT -“Fast radio transients” are defined here as bright millisecond pulses of radio-frequency energy. These shortduration -pulses can be produced by known objects such as pulsars or potentially by more exotic objects such as -evaporating black holes. 
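For orientation, the '+' lines introduced by this diff replace the plain paper text above with entity-relation triples of the form entity&entity&RELATION, where the relation label is 依赖 (dependency), 实现 (realization), AGGREGATION, or GENERALIZATION. A minimal sketch of how one such line might be split when these files are consumed is shown below; the class and method names are illustrative and are not taken from the project code.

    // Minimal sketch for splitting one "entity&entity&RELATION" line of the kind
    // added by this diff. Names are illustrative, not taken from the project.
    public class RelationTriple {
        final String source;
        final String target;
        final String relation;

        private RelationTriple(String source, String target, String relation) {
            this.source = source;
            this.target = target;
            this.relation = relation;
        }

        // Returns null for malformed lines, e.g. fragments or missing fields.
        static RelationTriple parse(String line) {
            String[] parts = line.trim().split("&");
            if (parts.length != 3 || parts[2].trim().isEmpty()) {
                return null;
            }
            return new RelationTriple(parts[0].trim(), parts[1].trim(), parts[2].trim());
        }

        public static void main(String[] args) {
            RelationTriple t = parse("framework&software pipeline&依赖");
            System.out.println(t.source + " -[" + t.relation + "]-> " + t.target);
        }
    }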
The identification and verification of such an event would be of great scientific value. This -is one major goal of the Very Long Baseline Array (VLBA) Fast Transient Experiment (V-FASTR), a softwarebased -detection system installed at the VLBA. V-FASTR uses a “commensal” (piggy-back) approach, analyzing -all array data continually during routine VLBA observations and identifying candidate fast transient events. Raw -data can be stored from a buffer memory, which enables a comprehensive off-line analysis. This is invaluable for -validating the astrophysical origin of any detection. Candidates discovered by the automatic system must be -reviewed each day by analysts to identify any promising signals that warrant a more in-depth investigation. To -support the timely analysis of fast transient detection candidates by V-FASTR scientists, we have developed a -metadata-driven, collaborative candidate review framework. The framework consists of a software pipeline for -metadata processing composed of both open source software components and project-specific code written -expressly to extract and catalog metadata from the incoming V-FASTR data products, and a web-based data portal -that facilitates browsing and inspection of the available metadata for candidate events extracted from the VLBA -radio data. -Key words: catalogs – methods: data analysis – pulsars: general – radio continuum: general -1. INTRODUCTION -One of the current primary goals of radio astronomy is to -explore and understand the “dynamic radio sky” (Cordes -et al. 2004). In contrast to generating catalogs of known -sources, this scientific thrust focuses on transient events, or -transient signals generated by persistent yet time-varying -sources. We do not yet fully understand the scope and -distribution of different transient sources, which range from -the known (e.g., active galactic nuclei, brown dwarfs, flare -stars, X-ray binaries, supernovae, gamma-ray bursts) to the -probable (e.g., exoplanets), to the possible (e.g., ET -civilizations, annihilating black holes). As noted by Cordes -et al. (2004, p.14), “most exciting would be the discovery of -new classes of sources” (italics in original). Radio telescopes -continue to increase their data collecting abilities, observing -the sky with progressively finer time resolution. Of current -particular interest is the detection and characterization of -“fast radio transients,” which last for only small fractions of a -second. -The V-FASTR experiment (Wayth et al. 2011) is one of a -new breed of radio astronomy experiments specifically -targeting fast transient radio signals. The experiment is -conducted in a fully commensal (passive) fashion, searching -for signals in the data gathered during the regular processing -activities of its host instrument. Unlike more traditional, -single-telescope observations, however, the V-FASTR -experiment simultaneously utilizes anywhere between 2 and -10 telescopes of the National Radio Astronomy Observatory -ʼs (NRAO) Very Long Baseline Array (VLBA) (Romney -2010). The VLBA consists of 10 25 m telescopes that are -positioned geographically such that no 2 are within each -otherʼs local horizon, and the V-FASTR experiment -leverages this configuration to better discriminate between -instances of terrestrial Radio Frequency Interference (RFI) -and potentially genuine astronomical pulses (Thompson -et al. 2011). 
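The multi-station reasoning described above, in which the widely separated VLBA antennas help distinguish local radio-frequency interference from a genuine astronomical pulse, can be illustrated schematically: a real pulse should be seen at several stations within a very short window, while RFI is usually local to one site. The Java fragment below is purely a sketch of that idea under invented station codes, window, and threshold; it is not the actual V-FASTR detection algorithm, which is described in Thompson et al. (2011).

    import java.util.List;

    // Schematic multi-station coincidence check. Station codes, window, and
    // threshold are invented; this is not the real V-FASTR algorithm.
    public class CoincidenceCheck {
        record Detection(String station, double timeSec) {}

        // Assumes the detections are time-ordered, earliest first.
        static boolean plausiblyAstronomical(List<Detection> hits, double windowSec, int minStations) {
            double start = hits.get(0).timeSec();
            long stations = hits.stream()
                    .filter(h -> h.timeSec() - start <= windowSec)
                    .map(Detection::station)
                    .distinct()
                    .count();
            return stations >= minStations;
        }

        public static void main(String[] args) {
            List<Detection> hits = List.of(
                    new Detection("PT", 12.0031),
                    new Detection("KP", 12.0034),
                    new Detection("LA", 12.0036));
            System.out.println(plausiblyAstronomical(hits, 0.01, 3)); // true: three separate stations
        }
    }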
-The huge volumes of raw time-series voltage data generated -by the VLBA in the course of its operation make storing the -full record of an entire observing session infeasible at the -present time. As a consequence, considerable effort has been -devoted to developing and fine-tuning algorithms for the realtime -identification of potentially interesting signals in the noisy -and often incomplete data (Thompson et al. 2011; Wayth et al. -2012). All data selected by the real-time processing step is -subsequently inspected, on a daily basis, by members of the -geographically distributed V-FASTR science team and either -discarded as spurious or archived offline for full analysis at a -later date. -The V-FASTR experiment must therefore operate within -several important resource constraints: the inability to archive -the full observational record due to space constraints, and a -practical workload constraint upon the human analysts -reviewing candidate detections. To address the latter, we have -developed a metadata-driven, collaborative candidate review -framework for the V-FASTR experiment. The framework -comprises a set of software components dedicated to the -automatic capture and organization of metadata describing the -candidate events identified as interesting by the automated -algorithms, and an online environment for the collaborative -perusal and inspection of related imagery data by the V-FASTR -analysis team. -The rest of this paper describes the system as follows. In -Section 2 we describe our project in a more general context. -Section 3 presents the methodology and an architectural -description of the system. We follow with an evaluation of -The Astronomical Journal, 149:23 (7pp), 2015 January doi:10.1088/0004-6256/149/1/23 -© 2015. The American Astronomical Society. All rights reserved. -1 -our experience deploying the framework in Section 4, and -offer conclusions and future directions for the work in -Section 5. -2. BACKGROUND -To better understand the context of the system implementation -presented in Section 3, we first briefly introduce the VFASTR -experiment and describe the development of scientific -data systems at the NASA Jet Propulsion Laboratory (JPL). -We then describe the Object Oriented Data Technology -(OODT) project, an open source information integration -platform that plays a central role in our framework. Finally, -we briefly touch upon several related efforts at developing -online tools to collaboratively classify and validate scientific -observations. -2.1. V-FASTR: The VLBA Fast TRansients Experiment -V-FASTR (VLBA Fast TRansients) is a data analysis -system used by the VLBA to detect candidate fast transient -events. Principal investigators submit observing proposals to -the VLBA targeted at galaxies, supernovae, quasars, pulsars, -and more. V-FASTR analyzes all data collected by the VLBA -as part of routine processing and produces a nightly list of -candidates identified within the data processed that day. The -raw data for each candidate is temporarily saved in case it is -needed to interpret or follow up on a particularly promising or -unusual detection. However, the raw data consumes significant -disk space and therefore the candidate list must be reviewed on -a timely basis by experts. False positives can be deleted and -their disk space reclaimed, while truly interesting events can be -re-processed to enable the generation of a sky image to localize -the source of the signal. 
Software tools that streamline and -simplify this review process are therefore highly valued by -candidate reviewers and can have a positive impact on other -similar efforts throughout the world. -2.2. Data System Development at JPL -The Data Management Systems and Technologies group at -the JPL develops software ground data systems to support -NASA science missions. These pipelines are specifically -optimized to support the data-intensive and computationallyintensive -processing steps often needed to convert raw remotesensing -observations into higher level data products at scale so -that they can be interpreted by the scientists. The process -almost always involves developing a close collaboration with -project scientists to obtain an understanding of the processing -algorithms involved, a sense of the scale and throughput -requirements, and other operational constraints of the expected -production environment. -Over the years the group has developed a diverse portfolio of -data system experience across a broad spectrum of domains -including earth and climate science (Mattmann et al. 2009; -Hart et al. 2011; Tran et al. 2011), planetary science, -astrophysics, snow hydrology, radio astronomy, cancer -research (Crichton et al. 2001), and pediatric intensive care -(Crichton et al. 2011). -2.3. Open Source and OODT -One of the products of this long track record of experience in -the realm of scientific data processing systems is a suite of -software components known as OODT1 originally arose out of -a desire on the part of NASAʼs Office of Space Science to -improve the return on investment for individual mission data -systems by leveraging commonalities in their design to create a -reusable platform of configurable components, on top of which -mission-specific customizations could be made. OODT thus -represents both an architecture and a reference implementation. -Its components communicate with one another over standard, -open protocols such as XML-RPC2 and can be used either -individually, or coupled together to form more complex data -processing pipelines. -In 2009 OODT began the transition from a JPL-internal -development project to a free and open source software project -at the Apache Software Foundation (ASF).3 Graduating to a -top-level project in 2011, OODT has since undergone several -public releases at the ASF and is in use by a varied group of -scientific and commercial endeavors. As we will describe -further in Section 3, several OODT components form the core -platform of our candidate validation framework. The ready -availability of OODT components under a liberal license, -combined with their substantial pedigree was appealing to our -project both for time and budgetary considerations. -2.4. Related Work -In the following section we identify several ongoing efforts -that also utilize online tools to assist in the collaborative review -and classification of scientific observations. -2.4.1. Astropulse -Astropulse is part of a series of sky surveys for radio pulses -being conducted by the Search for Extraterrestrial Intelligence -(SETI) at the University of Berkeley (Siemion et al. 2010). -The Astropulse project conducts a survey of the sky from the -Arecibo Observatory in Puerto Rico, searching for short -(microsecond) broadband radio frequency pulses. 
-While Astropulse's use of Arecibo's enormous single dish telescope affords excellent sensitivity, V-FASTR's ability to perform continent-scale baseline interferometry yields much greater positional accuracy when attempting to localize the source of a signal.
-As a variant of the SETI@home project, Astropulse utilizes the same distributed, collaborative volunteer computing infrastructure accumulated over the years by that effort to perform a number of computationally intense transformations and calculations of the data in an attempt to better classify the origin of any signals detected. The use of volunteer computing to perform units of computational work is an appealing approach that obviates the need to directly acquire sufficient hardware for the processing demands. However, the fully automated nature of the approach is not a natural fit for V-FASTR's manual review requirement.
-2.4.2. Galaxy Zoo
-Galaxy Zoo (http://www.galaxyzoo.org/) is an Internet-based project that relies on the help of volunteers to classify a very large database of galaxy images recorded by either the Sloan Digital Sky Survey or the Hubble telescope. Users are asked to classify galaxies based on shape, color, and direction of rotation, and to report on possible unidentified features. The rationale behind human intervention is that manual classification is more accurate and insightful than anything an automated program can currently achieve. To date, the project has met with success that far exceeded expectations: more than 250,000 volunteers have helped classify millions of images, resulting in the confirmation of important scientific hypotheses, the formulation of new ones, and the discovery of interesting new objects.
-While Galaxy Zoo's tactic of appealing to volunteers to mitigate the challenge of image classification at scale is attractive, the paradigm does not translate well to the V-FASTR setting due to differences in the nature of the archives between the two projects. Whereas Galaxy Zoo permits its volunteer reviewers to leisurely peruse and mine a largely static image archive, the rapidly growing data volumes associated with ongoing V-FASTR observations dictate that reviews must be regularly scheduled to keep the project within its resource limits.
-2.4.3. Foldit: The Protein Folding Game
-Foldit (Cooper et al. 2010) is a collaborative online protein folding game developed by the Center for Game Science at the University of Washington, and it represents a “crowd-sourced” attempt to solve the computationally challenging task of predicting protein structure. Proteins, chains of amino acids, play a key role in a wide range of human diseases, but comparatively little is known about how they contort themselves into the specific shapes that determine their function. Because of the scale and complexity of the challenge, the researchers behind Foldit have turned to the puzzle-solving capabilities of human beings for assistance. After learning the rules on simple challenges, players compete against one another to design alternative protein structures, with the goal of arriving at an arrangement that minimizes the total energy needed to maintain the shape.
-Foldit has created an environment in which the unknown and diverse strategies of its human participants become a core strength. Furthermore, by presenting the scientific activity as a competitive game, the project, which currently boasts over 400,000 players, has shown that it is possible to recruit and leverage human processing power at scale. This provides an interesting model for other projects, including V-FASTR, which at some point may rely upon a human element to augment or improve automated processes.
-3. IMPLEMENTATION
-In this section we provide details on the implementation of our metadata-driven framework for online review of V-FASTR candidate detection events. We describe our methodology and the considerations that informed our design, followed by a presentation of the system architecture.
-3.1. Development Methodology
-Several factors influenced the development process and have left their imprint on the final architecture. We feel that our implementation is uniquely suited to the needs of the V-FASTR project precisely because these factors were identified early on and were thus able to influence all aspects of the design process.
-3.1.1. Collaboration
-As described in Section 2, our group has developed substantial experience in the design and implementation of data systems for a broad range of scientific domains. In each case, a close working relationship with members of the project science team was an essential ingredient to the success of the project, and our experience developing an online candidate review framework for V-FASTR was no different. As software engineers familiar with the challenges inherent in scientific data management, our intuitions about the technical challenges of the system served us well in scoping out the project timeline. However, it was our early and regular communication with members of the V-FASTR science team that was critical to obtaining the domain knowledge necessary to make accurate assumptions, and to the early identification of issues. The current system architecture, covering both the back- and front-end elements, is a direct result of an ongoing feedback loop between the science and software teams.
-3.1.2. Constraints
-As mentioned in Section 2, V-FASTR is a commensal experiment that scans for fast transients in data that is already being collected as part of the regular third-party use of the VLBA instrument. As such, the experiment maintains a “guest” status on the NRAO computing infrastructure. Consequently, care must consistently be taken not to overtax NRAO system resources, including disk storage, CPU time, and network bandwidth. These physical constraints motivated many of the architectural decisions described in the following sections.
-Each V-FASTR data product may contain hundreds of files, rooted at a top-level job directory, and includes two types of products: filterbank data (up to ~100 GB per job) and baseband voltage data (up to ~10 GB per job). The total data storage capacity available to V-FASTR is just ~8 TB, enough to contain ~800 jobs of ~10 GB each (on average). Because products are produced at an average rate of ~10–20 per day (but sometimes in the hundreds), the storage would be exhausted within a few weeks unless products are periodically reviewed by the science team analysts; a back-of-the-envelope sketch of this budget follows.
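To make the storage pressure concrete, the following minimal sketch works through the arithmetic implied by the figures quoted above (an ~8 TB allocation, ~10 GB average job size, ~10–20 jobs per day). It is plain Java using only the paper's approximate averages; burst days with hundreds of jobs would shorten the window considerably.

```java
/**
 * Back-of-the-envelope estimate of how long the ~8 TB V-FASTR allocation
 * lasts if no products are reviewed and deleted. All numbers are the
 * rough averages quoted in the text, not measured values.
 */
public class StorageBudgetSketch {
    public static void main(String[] args) {
        double capacityTb = 8.0;      // total disk available to V-FASTR at NRAO
        double avgJobSizeGb = 10.0;   // average job size, dominated by baseband data
        double capacityJobs = capacityTb * 1024.0 / avgJobSizeGb;  // ~800 jobs

        for (double jobsPerDay : new double[] {10, 20}) {
            double days = capacityJobs / jobsPerDay;
            System.out.printf(
                "At %.0f jobs/day: ~%.0f jobs fit, storage exhausted in ~%.0f days (~%.1f weeks)%n",
                jobsPerDay, capacityJobs, days, days / 7.0);
        }
    }
}
```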
-During review, each candidate is either flagged for higher-resolution processing (and saved) or discarded as a false positive and the disk space reclaimed (see Figure 1 for an overview of the average data volumes per job at different processing stages). The desire to provide analysts with a streamlined method for this review process is at the very core of our design.
-Similarly, the network bandwidth constraints of the host led us to a data transfer configuration focused on metadata, rather than requiring the complete transfer of raw, unreviewed, and possibly spurious detection data over the Internet. Instead, metadata sufficient to describe the salient characteristics of a candidate event to a trained analyst was transferred into our candidate review framework. This careful selection process had the beneficial side effect of greatly limiting the size of the transferred products, allowing for a considerably longer retention period on the ~10 TB archive hosted at JPL.
-Finally, security constraints were also critically important to the design, particularly because the system spans two separate security domains: NRAO and JPL. To comply with the security requirements of the host system, data transfer was configured on the NRAO system to allow read-only operations and was made accessible only to clients originating from the JPL domain. Furthermore, on the front end, the functionality exposed by the web portal component interacted only with the local metadata archive, eliminating the possibility of corruption or inappropriate access to the raw observational data.
-3.2. Architecture
-As previously mentioned, the candidate review framework is driven by metadata describing the candidate events to be reviewed by V-FASTR analysts. To communicate this data from the raw source repository at the NRAO to an analyst using a browser anywhere in the world, we developed a software framework consisting of two principal components: a metadata pipeline that manages the capture, transfer, and storage of metadata annotations, and a web portal that provides analysts with a convenient, context-rich environment for efficiently classifying candidate events.
-3.2.1. Metadata Pipeline
-On the JPL side, the V-FASTR data products are processed through a metadata extraction and data archiving pipeline that eventually leads to the event candidates being available for inspection on the web portal. The pipeline is composed of three major software components: rsync, the OODT CAS Crawler, and the OODT File Manager, depicted in Figure 2.
-rsync. Data products are automatically transferred from the NRAO staging area to the JPL server using rsync. rsync is a popular application and data transfer protocol that allows the contents of a directory tree to be synchronized between two servers with minimal human intervention. It was chosen because of its simplicity, high performance, reliability, and wide range of configuration options. Through rsync, files are transferred in compressed form using delta encoding, meaning that only file differences are sent in subsequent transfers. For this project, an rsync server daemon was set up on the NRAO side to expose the data staging area where the products are collected. For security reasons, the daemon was restricted to allow read-only operations to clients originating from a designated JPL IP address; a sketch of the corresponding client-side pull is shown below.
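The sketch below illustrates the kind of hourly, metadata-only pull described in this subsection, expressed as a small Java wrapper around the standard rsync client. The daemon module name, local path, and include patterns are hypothetical; the text only states that detection images plus the output and calibration files are mirrored, with compression and delta encoding enabled.

```java
import java.io.IOException;

/**
 * Minimal sketch of an hourly, metadata-only rsync pull from a read-only
 * daemon. Module name, paths, and file patterns are illustrative only;
 * in practice this would be scheduled by cron rather than run by hand.
 */
public class HourlyMetadataPull {
    public static void main(String[] args) throws IOException, InterruptedException {
        ProcessBuilder rsync = new ProcessBuilder(
                "rsync", "-az",            // archive mode + compression; rsync's delta encoding applies on updates
                "--prune-empty-dirs",
                "--include=*/",            // descend into job/scan directories
                "--include=*-det.jpg",     // detection images
                "--include=*-dedisp.jpg",
                "--include=*.out",         // hypothetical output/calibration file patterns
                "--include=*.cal",
                "--exclude=*",             // skip everything else (raw voltage and filterbank data)
                "nrao-staging::vfastr/",   // read-only rsync daemon module on the NRAO side (hypothetical name)
                "/archive/vfastr/");       // local JPL archive area (hypothetical path)
        rsync.inheritIO();
        int exit = rsync.start().waitFor();
        System.out.println("rsync exited with status " + exit);
    }
}
```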
-On the JPL side, an rsync client was set up to run hourly as a system cron job, transferring products to the JPL archive area. To minimize bandwidth usage, the client only transfers a very small subset of the data comprising a product directory tree, namely the detection images and the output and calibration files containing the metadata needed by the web portal. On average, this represents a reduction of the data product size by a factor of ~3.5 × 10³: from an average size of ~35 GB on the NRAO server (for a product with several detections) to ~10 MB on the JPL server. The rsync data transfer rates between the two servers were measured to be around ~2 MB s⁻¹, more than enough to transfer between 10 and 20 data products per day.
-CAS Crawler. Once the data products are transferred to the JPL server, they are automatically detected by the OODT CAS Crawler daemon, which runs at sub-hour time intervals to pick up new products as soon as they become available. The Crawler is responsible for notifying the OODT File Manager and therefore starting the product ingestion process. For this deployment, the Crawler was configured to send a signal only if two preconditions are satisfied: (1) a similarly named product does not already exist in the File Manager catalog, and (2) the product directory contains a special marker file indicating that the product has been processed by the mail program and is therefore in a complete state (i.e., no files are missing).
-CAS File Manager. The OODT CAS File Manager is a customizable software component that is responsible for processing and archiving a data product, making it available for query and access to clients. For this project, the File Manager was deployed with the default Apache Lucene metadata back-end, and configured to archive products in place, i.e., without moving them to a separate archive directory; otherwise the rsync process would transfer them again from the NRAO server. Additionally, we leveraged the extensibility of the OODT framework by configuring the File Manager with custom metadata extractors that were purposely written to parse the information contained in the V-FASTR output and calibration files. Information is extracted at the three levels that comprise the hierarchy of a V-FASTR data product: job, scan, and event. Additionally, a numerical algorithm was written to assign each pair of available images (-det.jpg and -dedisp.jpg) to the event that generated them.
-Figure 1. Depiction of the full V-FASTR data flow with volume estimates (per job) at each stage. The candidate review framework (both the metadata pipeline and web portal components) interacts with the metadata and derived products repository at the intersection of A and B above.
-In general, a File Manager can store metadata in its back-end catalog as different object types. Each object type is defined to contain multiple metadata fields, where each field is composed of a named key associated with one or more string values. For this project, the decision was made to maintain a one-to-one correspondence between a data product and the corresponding metadata ingested into the catalog. So rather than defining three object types for jobs, scans, and events, a single object type was used, holding all information for a data product in a single container, with dynamically named keys that are encoded to contain the scan and event numbers.
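As a concrete illustration of this single-container layout, the sketch below builds one flat map of metadata fields per product, with event-level keys that encode the scan and event numbers. The key format (Scan<i>/Event<j>/<field>) and the sample field names are assumptions made for illustration; the text states only that keys are dynamically named and encode those two numbers.

```java
import java.util.*;

/**
 * Illustrative sketch of a one-container-per-product metadata layout.
 * The Scan<i>/Event<j>/<field> key scheme and example values are
 * hypothetical; only the flat, dynamically keyed structure is from the text.
 */
public class ProductMetadataSketch {
    private final Map<String, List<String>> fields = new LinkedHashMap<>();

    /** Job-level field stored under its plain name. */
    public void addJobField(String name, String value) {
        fields.computeIfAbsent(name, k -> new ArrayList<>()).add(value);
    }

    /** Event-level field stored under a key encoding the scan and event numbers. */
    public void addEventField(int scan, int event, String name, String value) {
        String key = String.format("Scan%d/Event%d/%s", scan, event, name);
        fields.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
    }

    public Map<String, List<String>> asMap() {
        return Collections.unmodifiableMap(fields);
    }

    public static void main(String[] args) {
        ProductMetadataSketch product = new ProductMetadataSketch();
        product.addJobField("JobId", "vfastr-20120601-001");            // hypothetical identifier
        product.addEventField(3, 12, "DM", "265.5");                    // hypothetical dispersion measure
        product.addEventField(3, 12, "DetectionImage", "scan3-event12-det.jpg");
        product.asMap().forEach((k, v) -> System.out.println(k + " = " + v));
    }
}
```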
-This single-container design was motivated by the desire to simplify and optimize the querying of information by the web portal client, since all metadata for a product is retrieved through a single request to the File Manager. As a consequence, the default Apache Lucene metadata catalog implementation had to be slightly modified to allow for the ingestion of dynamically named metadata fields.
-3.2.2. Web Portal
-The second major component of the candidate review framework is an interactive web portal. The primary purpose of the portal is to provide a convenient online environment for the location-independent perusal and assessment of potential candidates in context. The portal provides V-FASTR analysts with the ability to quickly navigate through the available information to identify candidates worthy of further inspection on a familiar web platform.
-The portal has been implemented as a PHP web application using the Apache OODT Balance web framework running on top of the Apache HTTPD Web Server. OODT Balance was chosen here for its ability to easily integrate with the OODT components in the back-end metadata pipeline, namely the OODT CAS File Manager described earlier. Furthermore, the flexible, modular approach of the framework allowed us to quickly connect the web portal to the metadata repository and rapidly begin constructing the necessary views specific to the V-FASTR candidate review and validation use cases.
-As Figure 3 shows, the web portal offers a variety of views of the available metadata, which are hierarchically organized to match the conceptual relationships in the data. At the highest level, a job or run might consist of multiple scans, each of which may itself contain multiple detection event candidates. This hierarchy, expressed in the metadata, is preserved in the layout of the portal views, and breadcrumb navigation is provided to facilitate orientation within the nested structure.
-At the level of an individual event candidate (Figure 3, middle image), two graphical representations of the event are available to assist analysts in classifying the nature of the signal. These images are generated automatically as part of the initial candidate identification process (Wayth et al. 2011), and they provide a trained analyst the structural clues needed to rapidly assess the received signal as being genuinely extraterrestrial in origin or merely a product of radio-frequency interference (RFI).
-To support both metadata browsing in context and the desire for an analyst to be able to rapidly peruse the image representations of an entire job (many events in many scans) at once, a compromise was struck whereby, for each job, a portal user may select either a traditional, hierarchical navigation or a flattened view in which all of the (possibly numerous) event candidates are presented simultaneously on screen and can be accessed serially simply by scrolling the view.
-Figure 2. Component diagram for the metadata pipeline component of the V-FASTR candidate review framework.
-Together, the metadata pipeline and the web portal constitute an end-to-end framework for capturing, archiving, and presenting metadata about detected transient event candidates to V-FASTR scientists. Furthermore, by providing a reliable process and flexible interface, the system directly streamlines the analysis process, boosting the overall efficiency of the project.
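To close out the architectural description, the sketch below shows how a portal view could regroup one flat, single-request metadata response into the job/scan/event hierarchy described above. The actual portal is a PHP application built on OODT Balance, so this Java version is purely illustrative, and it reuses the hypothetical Scan<i>/Event<j>/<field> key format from the previous sketch.

```java
import java.util.*;
import java.util.regex.*;

/**
 * Illustrative regrouping of a flat metadata map into scan -> event -> field
 * nesting for display. Key format and sample values are hypothetical.
 */
public class HierarchyGroupingSketch {
    private static final Pattern EVENT_KEY = Pattern.compile("Scan(\\d+)/Event(\\d+)/(.+)");

    public static Map<Integer, Map<Integer, Map<String, List<String>>>> group(Map<String, List<String>> flat) {
        Map<Integer, Map<Integer, Map<String, List<String>>>> scans = new TreeMap<>();
        for (Map.Entry<String, List<String>> e : flat.entrySet()) {
            Matcher m = EVENT_KEY.matcher(e.getKey());
            if (!m.matches()) {
                continue; // job-level field; would be shown on the job page instead
            }
            int scan = Integer.parseInt(m.group(1));
            int event = Integer.parseInt(m.group(2));
            scans.computeIfAbsent(scan, s -> new TreeMap<>())
                 .computeIfAbsent(event, ev -> new LinkedHashMap<>())
                 .computeIfAbsent(m.group(3), f -> new ArrayList<>())
                 .addAll(e.getValue());
        }
        return scans;
    }

    public static void main(String[] args) {
        Map<String, List<String>> flat = new LinkedHashMap<>();
        flat.put("JobId", List.of("vfastr-20120601-001"));
        flat.put("Scan3/Event12/DM", List.of("265.5"));
        flat.put("Scan3/Event12/DetectionImage", List.of("scan3-event12-det.jpg"));
        flat.put("Scan4/Event1/DM", List.of("71.0"));
        group(flat).forEach((scan, events) -> System.out.println("Scan " + scan + ": " + events));
    }
}
```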
-4. EVALUATION
-As we have described in the previous section, the candidate review framework embraces the model of online collaborative validation of fast transient candidates by a team of geographically dispersed analysts, and improves the efficiency with which analysts may classify observational data. In this section we describe the early results of our experience with the operational deployment of the framework, as well as highlight several areas for the evolution of the tool to further enhance its utility.
-4.1. Experience
-The initial deployment of the collaborative review framework for operational use by the V-FASTR science team was made in early summer 2012. The immediate feedback was largely positive: analysts praised the capabilities of the system, the improved accessibility afforded by a web-based user interface, and the newfound capability to navigate rapidly through all detections in a given job, or to peruse the different levels (scans and events) within a job individually. The biggest initial complaint with the system was that too many mouse clicks were required to complete an analysis of all of the candidates in an entire job.
-A consequence of the iterative feedback loop that developed between the software and science teams (described further in Section 3) was that suggestions for improvements were repeatedly made, tracked, and acted upon. This process resulted in an updated release occurring approximately every two weeks during the first few months of the deployment. Suggestions for improvements included the addition of various metadata fields identified as critical to the classification task, updates to the visual organization of the elements of the web portal views, and a relentless focus on reducing the number of mouse clicks required on the part of analyst users.
-At the time of this writing, the V-FASTR portal has been running operationally for several weeks, and we can draw some early conclusions on the usefulness of the system. Overall, as reported by the science team, the project appears to have accomplished its broad goal of facilitating the collaborative task of inspecting and screening radio-transient events. Because all relevant metadata is extracted from the latest data products and presented on the web portal in a concise fashion, scientists can now execute their tasks more efficiently than before, when they had to log onto a terminal and analyze the raw data manually. Additionally, the online availability of all data and metadata through a browser interface (as opposed to an ssh terminal) has allowed for greater flexibility with regard to when and where evaluations can be performed, including for the first time on a mobile device.
-4.2. Evolution
-On the whole, the ability to interact with the totality of the candidate data and metadata through a browser interface has greatly expanded the analysts’ ability to perform their tasks with greater flexibility regarding when and where evaluations can be performed. This includes, for the first time, anecdotal accounts of an analyst reviewing candidates from a mobile device.
-With this freedom, in turn, has come a number of feature requests that, taken together, form a roadmap of sorts for the evolution of the framework.
-Now that the interaction with candidate metadata has transitioned to the browser, the science team has identified three key features they feel would complete the transition and entirely replace the prior ad hoc methods for coordinating the analysts’ activities:
-Job assignment. As mentioned in Section 3, the timely review of detection candidates is critical to remaining within the resource constraints imposed upon the experiment. At the moment, review jobs are assigned to analysts via email. Augmenting the web portal with the ability to identify an individual analyst would enable the presentation of a prioritized list of that analyst’s outstanding review tasks.
-Effort tracking. Along the same lines, it is important to spread the analysis load evenly across the science team, since no one person performs the analysis as his or her full-time job. Augmenting the framework with the ability to track the analysis contributions of individual users over time would assist in the equitable scheduling of future review jobs.
-In-browser archiving. When an analyst determines that a candidate event merits high-resolution follow-up, the last step is to archive the associated raw data so that it can be evaluated at a later date. Currently, due to the security restrictions permitting read-only access to external connections to the archive at the NRAO (described in Section 3), this process is handled out-of-band by the analyst logging into an NRAO machine and archiving the appropriate data manually. It is possible that, with the identity management features discussed in the previous two items (and the associated auditing capabilities they could entail), the restrictions might be negotiated to the point that certain defined activities (such as archiving a single job directory) could be initiated from within the portal environment.
-Figure 3. Screen shots of the initial version of the web portal component. From left to right: the portal home page displaying recent jobs and associated event counts; image metadata associated with an individual event candidate; and the full metadata listing, including associated scans, for an observation job.
-5. CONCLUSION
-V-FASTR, and commensal operations more generally, are particularly challenging experiments due to extreme data volume and real-time requirements. Processing occurs continually, and the data flow must be coordinated across multiple physical locations with transport mechanisms ranging from FedEx transport (disks from the antenna), to high-bandwidth interconnects (the correlator and transient detection systems), daily rsync over IP (the ska-dc mirror), and distributed WWW protocols (manual review, which takes place by analysts on three continents). Various components of the system operate on millisecond, hourly, and daily clocks, and all components must continue operating since there is very little margin for buffer resources. In addition, the data processing components are highly heterogeneous, with human experts playing their own role as scheduled pattern recognition engines in the overall architecture. By facilitating timely review, and reducing the learning curve for new reviewers, the V-FASTR portal will play a critical role in keeping the data flowing and making the system sustainable in the long term.
-This effort was supported by the Jet Propulsion Laboratory, managed by the California Institute of Technology under a contract with the National Aeronautics and Space Administration.
-REFERENCES
-Cooper, S., Khatib, F., & Treuille, A. 2010, Natur, 466, 756–60
-Cordes, J., Lazio, T., & McLaughlin, M. 2004, NewAR, 48, 1459–72
-Crichton, D., Kincaid, H., Downing, G., Srivastava, S., & Hughes, J. S. 2001, in Proc. of the 14th IEEE Symp. on Computer-Based Medical Systems, An Interoperable Data Architecture for Data Exchange in a Biomedical Research Network (Piscataway, NJ: IEEE), 65–72
-Crichton, D., Mattmann, C., Hart, A., et al. 2011, in Proc. of the 24th IEEE Symp. on Computer-Based Medical Systems, An Informatics Architecture for the Virtual Pediatric Intensive Care Unit (Piscataway, NJ: IEEE), 1–6
-Hart, A., Goodale, C., Mattmann, C., et al. 2011, in Proc. of the 2nd Int. Workshop on Software Engineering for Cloud Computing, A Cloud-enabled Regional Climate Model Evaluation System (New York: ACM), 43–49
-Mattmann, C., Crichton, D., Medvidovic, N., & Hughes, J. S. 2006, in Proc. of the 2006 Int. Conf. on Software Engineering, A Software Architecture-based Framework for Highly Distributed and Data Intensive Scientific Applications (New York: ACM), 721–30
-Mattmann, C., Freeborn, D., Crichton, D., et al. 2009, in Proc. of the IEEE Int. Conf. on Space Mission Challenges for Information Technology, A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE Missions (Piscataway, NJ: IEEE), 165–72
-Romney, J. D. 2010, NRAO, http://www.vlba.nrao.edu/astro/obstatus/current/obssum.html
-Siemion, A., von Korff, J., McMahon, P., Korpela, E., & Werthimer, D. 2010, AcAau, 67, 1342–9
-Thompson, D., Wagstaff, K., Brisken, W., et al. 2011, ApJ, 735, 98
-Tran, J., Cinquini, L., Mattmann, C., et al. 2011, in Proc. of the 2nd Int. Workshop on Software Engineering for Cloud Computing, Evaluating Cloud Computing in the NASA DESDynI Ground Data System (New York: ACM), 36–42
-Wayth, R., Brisken, W., Deller, A., et al. 2011, ApJ, 735, 97
-Wayth, R., Tingay, S., & Deller, A. 2012, ApJL, 753, L36
\ No newline at end of file +structs,structs +BooleanQueryCriteria,BooleanQueryCriteria +Element,Element +ExtractorSpec,ExtractorSpec +ExtractorSpec,spicified Extractor extractors +ExtractorSpec,MimeTypeExtractor +ExtractorSpec,CoreMetExtractor +ExtractorSpec,JobSpec +FileTransferStatus,FileTransferStatus +FileTransferStatus,File Manager extension points +FileTransferStatus,FileManagerDashboardNulled +FileTransferStatus,DevExtremeJavaScriptFile +FileTransferStatus,File Manager UI component +FileTransferStatus,File Manager server interface +FileTransferStatus,File Manager Command class +FileTransferStatus,File Manager web interface +FileTransferStatus,FileManagerAppUI +FileTransferStatus,File content +FileTransferStatus,JSXFile +FileTransferStatus,File Manager file System Provider +FileTransferStatus,File type +FileTransferStatus,File information +FreeTextQueryCriteria,FreeTextQueryCriteria +FreeTextQueryCriteria,FreePastry +FreeTextQueryCriteria,implementation Free +Product,Product +ProductPage,ProductPage +ProductPage,Product structures +ProductPage,ProductMetadata +ProductPage,ProductGenerationExecutives +ProductPage,ProductPctTransferred +ProductPage,ProductId +ProductPage,FileManagerPage +ProductPage,ProductTypes +ProductPage,ProductStructure +ProductPage,Product operation +ProductPage,Product instance information +ProductPage,Product listing pages +ProductType,ProductType +ProductType,Product structures +ProductType,ProductMetadata +ProductType,Type Script script +ProductType,ProductGenerationExecutives +ProductType,ProductPctTransferred +ProductType,ProductId +ProductType,ProductTypes +ProductType,ProductStructure +ProductType,Product operation +ProductType,Product instance information +ProductType,product Type +ProductType,Product listing pages +Query,Query +QueryCriteria,QueryCriteria +RangeQueryCriteria,RangeQueryCriteria +Reference,Reference +TermQueryCriteria,TermQueryCriteria +TestProduct,TestProduct +TestReference,TestReference +System,System +XmlRpcFileManager,XmlRpcFileManager +XmlRpcFileManager,import File Manager +XmlRpcFileManager,OpenFileManager +XmlRpcFileManager,XmlRpcFileManagerClient +XmlRpcFileManager,TrilloFileManager +XmlRpcFileManager,KendoUIFileManager +XmlRpcFileManager,Manager The Repository Manager extension +XmlRpcFileManager,ExtremeJavaScriptFileManager +XmlRpcFileManager,RepositoryManagerTheRepositoryManager +XmlRpcFileManager,CASFileManager +XmlRpcFileManager,Xml Rpc File Manager complex +XmlRpcFileManager,Manager extension point +XmlRpcFileManager,WebixFileManager +XmlRpcFileManager,bs4 File Manager +XmlRpcFileManager,ApacheOODTFileManager +XmlRpcFileManagerClient,XmlRpcFileManagerClient +XmlRpcFileManagerClient,Client implementation +XmlRpcFileManagerClient,Client React connector +XmlRpcFileManagerClient,Client React component +XmlRpcFileManagerClient,Client side metadata +XmlRpcFileManagerClient,MahasenClient +Dispatcher,Dispatcher +Result,Result +SecureWebServer,SecureWebServer +SecureWebServer,ApacheHTTPDWebServer +SecureWebServer,Server Node API v1 Localization +SecureWebServer,WindowsServer +Repository,Repository +RepositoryManager,RepositoryManager +RepositoryManager,SoftwarePackageManager +RepositoryManager,TaskManager +RepositoryManager,import File Manager +RepositoryManager,OpenFileManager +RepositoryManager,NETCoreFileManager +RepositoryManager,OODTCASFileManager +RepositoryManager,NodePackageManager +RepositoryManager,TrilloFileManager +RepositoryManager,KendoUIFileManager +RepositoryManager,WordPressFileManager +RepositoryManager,Manager The 
Repository Manager extension +RepositoryManager,ManagerIngestUseCase +RepositoryManager,ExtremeJavaScriptFileManager +RepositoryManager,Repository Manager extension point +RepositoryManager,RepositoryManagerTheRepositoryManager +RepositoryManager,RepositoryManagerTheRepositoryManager +RepositoryManager,CASFileManager +RepositoryManager,ManagerObjectModel +RepositoryManager,demand Library Manager +RepositoryManager,DeployTrilloFileManager +RepositoryManager,Manager extension point +RepositoryManager,WebixFileManager +RepositoryManager,bs4 File Manager +RepositoryManager,ApacheOODTFileManager +RepositoryManagerFactory,RepositoryManagerFactory +RepositoryManagerFactory,Repository Manager extension point +RepositoryManagerFactory,RepositoryManagerTheRepositoryManager +DataSourceRepositoryManager,DataSourceRepositoryManager +DataSourceRepositoryManager,SoftwarePackageManager +DataSourceRepositoryManager,Data nodes +DataSourceRepositoryManager,Manager control +DataSourceRepositoryManager,TaskManager +DataSourceRepositoryManager,import File Manager +DataSourceRepositoryManager,OpenFileManager +DataSourceRepositoryManager,NETCoreFileManager +DataSourceRepositoryManager,OODTCASFileManager +DataSourceRepositoryManager,NodePackageManager +DataSourceRepositoryManager,MoneydanceFinancialData +DataSourceRepositoryManager,Data Transfer interface +DataSourceRepositoryManager,Manager The Repository Manager extension +DataSourceRepositoryManager,Data products +DataSourceRepositoryManager,ExtremeJavaScriptFileManager +DataSourceRepositoryManager,ManagerArchitecture +DataSourceRepositoryManager,Data registry +DataSourceRepositoryManager,Manager catalog +DataSourceRepositoryManager,RepositoryManagerTheRepositoryManager +DataSourceRepositoryManager,CASFileManager +DataSourceRepositoryManager,ManagerObjectModel +DataSourceRepositoryManager,demand Library Manager +DataSourceRepositoryManager,Manager extension point +DataSourceRepositoryManager,WebixFileManager +DataSourceRepositoryManager,DataManagementSystems +DataSourceRepositoryManager,Data volume +DataSourceRepositoryManager,DataSystemDevelopment +DataSourceRepositoryManagerFactory,DataSourceRepositoryManagerFactory +DataSourceRepositoryManagerFactory,Data nodes +DataSourceRepositoryManagerFactory,MoneydanceFinancialData +DataSourceRepositoryManagerFactory,Data Transfer extension point +DataSourceRepositoryManagerFactory,Data Transfer interface +DataSourceRepositoryManagerFactory,Data products +DataSourceRepositoryManagerFactory,Data registry +DataSourceRepositoryManagerFactory,DataGridManagementSystem +DataSourceRepositoryManagerFactory,DataManagementSystems +DataSourceRepositoryManagerFactory,Data volume +DataSourceRepositoryManagerFactory,DataSystemDevelopment +ScienceDataRepositoryManager,ScienceDataRepositoryManager +ScienceDataRepositoryManager,SoftwarePackageManager +ScienceDataRepositoryManager,Manager control +ScienceDataRepositoryManager,TaskManager +ScienceDataRepositoryManager,import File Manager +ScienceDataRepositoryManager,OpenFileManager +ScienceDataRepositoryManager,NETCoreFileManager +ScienceDataRepositoryManager,OODTCASFileManager +ScienceDataRepositoryManager,SciencePgeConfigFileWriter +ScienceDataRepositoryManager,NodePackageManager +ScienceDataRepositoryManager,TrilloFileManager +ScienceDataRepositoryManager,KendoUIFileManager +ScienceDataRepositoryManager,WordPressFileManager +ScienceDataRepositoryManager,Manager The Repository Manager extension +ScienceDataRepositoryManager,ComputerScience 
+ScienceDataRepositoryManager,ExtremeJavaScriptFileManager +ScienceDataRepositoryManager,ManagerArchitecture +ScienceDataRepositoryManager,ScienceProduct +ScienceDataRepositoryManager,Manager catalog +ScienceDataRepositoryManager,RepositoryManagerTheRepositoryManager +ScienceDataRepositoryManager,CASFileManager +ScienceDataRepositoryManager,ManagerObjectModel +ScienceDataRepositoryManager,SpaceScience +ScienceDataRepositoryManager,demand Library Manager +ScienceDataRepositoryManager,DeployTrilloFileManager +ScienceDataRepositoryManager,Manager extension point +ScienceDataRepositoryManager,WebixFileManager +ScienceDataRepositoryManager,bs4 File Manager +ScienceDataRepositoryManager,ApacheOODTFileManager +ScienceDataRepositoryManagerFactory,ScienceDataRepositoryManagerFactory +ScienceDataRepositoryManagerFactory,ComputerScience +ScienceDataRepositoryManagerFactory,ScienceProduct +ScienceDataRepositoryManagerFactory,SpaceScience +ScienceDataRepositoryManagerFactory,ScienceDataProcessingSystems +TestXMLRepositoryManager,TestXMLRepositoryManager +TestXMLRepositoryManager,SoftwarePackageManager +TestXMLRepositoryManager,Manager control +TestXMLRepositoryManager,TaskManager +TestXMLRepositoryManager,import File Manager +TestXMLRepositoryManager,OpenFileManager +TestXMLRepositoryManager,NodePackageManager +TestXMLRepositoryManager,TrilloFileManager +TestXMLRepositoryManager,Manager The Repository Manager extension +TestXMLRepositoryManager,ExtremeJavaScriptFileManager +TestXMLRepositoryManager,ManagerArchitecture +TestXMLRepositoryManager,Manager catalog +TestXMLRepositoryManager,RepositoryManagerTheRepositoryManager +TestXMLRepositoryManager,CASFileManager +TestXMLRepositoryManager,ManagerObjectModel +TestXMLRepositoryManager,demand Library Manager +TestXMLRepositoryManager,Manager extension point +TestXMLRepositoryManager,WebixFileManager +TestXMLRepositoryManager,bs4 File Manager +XMLRepositoryManager,XMLRepositoryManager +XMLRepositoryManager,XMLMetadataConceptCatalog +XMLRepositoryManager,Manager control +XMLRepositoryManager,TaskManager +XMLRepositoryManager,import File Manager +XMLRepositoryManager,OpenFileManager +XMLRepositoryManager,NETCoreFileManager +XMLRepositoryManager,OODTCASFileManager +XMLRepositoryManager,XML files +XMLRepositoryManager,TrilloFileManager +XMLRepositoryManager,KendoUIFileManager +XMLRepositoryManager,WordPressFileManager +XMLRepositoryManager,Manager The Repository Manager extension +XMLRepositoryManager,ManagerIngestUseCase +XMLRepositoryManager,ExtremeJavaScriptFileManager +XMLRepositoryManager,ManagerArchitecture +XMLRepositoryManager,Manager catalog +XMLRepositoryManager,RepositoryManagerTheRepositoryManager +XMLRepositoryManager,CASFileManager +XMLRepositoryManager,DeployTrilloFileManager +XMLRepositoryManager,WebixFileManager +XMLRepositoryManager,bs4 File Manager +XMLRepositoryManager,ApacheOODTFileManager +XMLRepositoryManagerFactory,XMLRepositoryManagerFactory +XMLRepositoryManagerFactory,XML files +Metadata,Metadata +CoreMetKeys,CoreMetKeys +CoreMetKeys,Core machines +CoreMetKeys,CoreData +FileAttributesMetKeys,FileAttributesMetKeys +FileAttributesMetKeys,File Managers functionality +FileAttributesMetKeys,File Browser dialogs +FileAttributesMetKeys,FileRetrievalSystem +FileAttributesMetKeys,FileUploadComponent +FileAttributesMetKeys,DevExtremeJavaScriptFile +FileAttributesMetKeys,FileLocationExtractor +FileAttributesMetKeys,FilePctTransferred +FileAttributesMetKeys,File Repository layouts +FileAttributesMetKeys,File content +FileAttributesMetKeys,JSXFile 
+FileAttributesMetKeys,File type +ProductMetKeys,ProductMetKeys +ProductMetKeys,Product structures +ProductMetKeys,ProductMetadata +ProductMetKeys,ProductId +ProductMetKeys,ProductTypes +ProductMetKeys,ScienceProduct +ProductMetKeys,ProductStructure +ProductMetKeys,Product operation +FilemgrMetExtractor,FilemgrMetExtractor +FilemgrMetExtractor,CoreMetExtractor +AbstractFilemgrMetExtractor,AbstractFilemgrMetExtractor +AbstractFilemgrMetExtractor,MimeTypeExtractor +AbstractFilemgrMetExtractor,CoreMetExtractor +CoreMetExtractor,CoreMetExtractor +CoreMetExtractor,Core machines +CoreMetExtractor,Extractor extractors +CoreMetExtractor,MimeTypeExtractor +CoreMetExtractor,CoreData +TestAbstractFilemgrMetExtractor,TestAbstractFilemgrMetExtractor +TestAbstractFilemgrMetExtractor,Extractor extractors +TestAbstractFilemgrMetExtractor,MimeTypeExtractor +TestAbstractFilemgrMetExtractor,CoreMetExtractor +TestCoreMetExtractor,TestCoreMetExtractor +TestCoreMetExtractor,Extractor extractors +TestCoreMetExtractor,MimeTypeExtractor +TestCoreMetExtractor,CoreMetExtractor +Catalog,Catalog +CatalogFactory,CatalogFactory +DataSourceCatalog,DataSourceCatalog +DataSourceCatalog,XMLMetadataConceptCatalog +DataSourceCatalog,Data nodes +DataSourceCatalog,LuceneIndexCatalog +DataSourceCatalog,Catalog extension point interface +DataSourceCatalog,Data Transfer extension point +DataSourceCatalog,Data products +DataSourceCatalog,DataNodes +DataSourceCatalog,Data registry +DataSourceCatalog,CoreData +DataSourceCatalog,DataSystemPDS4InformationModel-Driven +DataSourceCatalog,LuceneCatalog +DataSourceCatalog,DataList +DataSourceCatalog,Catalog interface +DataSourceCatalog,GoddardEarthScienceData +DataSourceCatalog,DataStructure +DataSourceCatalog,DataGridManagementSystem +DataSourceCatalog,Data volume +DataSourceCatalogFactory,DataSourceCatalogFactory +DataSourceCatalogFactory,Data nodes +DataSourceCatalogFactory,DataIntensiveScientific +DataSourceCatalogFactory,MoneydanceFinancialData +DataSourceCatalogFactory,Data Transfer interface +DataSourceCatalogFactory,Data products +DataSourceCatalogFactory,DataNodes +DataSourceCatalogFactory,Data registry +DataSourceCatalogFactory,CoreData +DataSourceCatalogFactory,DataSystemPDS4InformationModel-Driven +DataSourceCatalogFactory,DataList +DataSourceCatalogFactory,DataStructure +DataSourceCatalogFactory,DataManagementSystems +DataSourceCatalogFactory,Data volume +DataSourceCatalogFactory,DataSystemDevelopment +LenientDataSourceCatalog,LenientDataSourceCatalog +LenientDataSourceCatalog,LuceneIndexCatalog +LenientDataSourceCatalog,AMGAMetadataCatalog +LenientDataSourceCatalog,Catalog extension point +LenientDataSourceCatalog,LuceneCatalog +LenientDataSourceCatalog,Catalog interface +LenientDataSourceCatalog,ApacheOODTCatalog +LuceneCatalog,LuceneCatalog +LuceneCatalog,XMLMetadataConceptCatalog +LuceneCatalog,LuceneIndexCatalog +LuceneCatalog,LuceneIndexCatalog +LuceneCatalog,AMGAMetadataCatalog +LuceneCatalog,Catalog extension point interface +LuceneCatalog,Catalog extension point +LuceneCatalog,ApacheOODTCatalog +LuceneCatalog,default Apache Lucene +LuceneCatalogFactory,LuceneCatalogFactory +LuceneCatalogFactory,LuceneCatalog +MappedDataSourceCatalog,MappedDataSourceCatalog +MappedDataSourceCatalog,LuceneIndexCatalog +MappedDataSourceCatalog,AMGAMetadataCatalog +MappedDataSourceCatalog,Catalog extension point +MappedDataSourceCatalog,LuceneCatalog +MappedDataSourceCatalog,Catalog interface +MappedDataSourceCatalog,ApacheOODTCatalog 
+MappedDataSourceCatalogFactory,MappedDataSourceCatalogFactory +MockCatalog,MockCatalog +MockCatalogFactory,MockCatalogFactory +ScienceDataCatalog,ScienceDataCatalog +ScienceDataCatalog,XMLMetadataConceptCatalog +ScienceDataCatalog,AMGAMetadataCatalog +ScienceDataCatalog,SciencePgeConfigFileWriter +ScienceDataCatalog,Catalog extension point interface +ScienceDataCatalog,ComputerScience +ScienceDataCatalog,ScienceProduct +ScienceDataCatalog,LuceneCatalog +ScienceDataCatalog,Catalog interface +ScienceDataCatalog,ApacheOODTCatalog +ScienceDataCatalog,SpaceScience +ScienceDataCatalog,ScienceDataProcessingSystems +ScienceDataCatalogFactory,ScienceDataCatalogFactory +ScienceDataCatalogFactory,SciencePgeConfigFileWriter +ScienceDataCatalogFactory,ComputerScience +ScienceDataCatalogFactory,ScienceProduct +ScienceDataCatalogFactory,SpaceScience +Catalog,Catalog +ProductIdGenerator,ProductIdGenerator +ProductIdGenerator,Product structures +ProductIdGenerator,ProductId +ProductIdGenerator,ProductTypes +ProductIdGenerator,ScienceProduct +ProductIdGenerator,ProductStructure +ProductIdGenerator,Product operation +ProductSerializer,ProductSerializer +CompleteProduct,CompleteProduct +CompleteProduct,ProductGenerationExecutives +CompleteProduct,ProductPctTransferred +CompleteProduct,ScienceProduct +CompleteProduct,Product instance information +CompleteProduct,Product listing pages +DefaultProductSerializer,DefaultProductSerializer +DefaultProductSerializer,Default configuration +NameProductIdGenerator,NameProductIdGenerator +NameProductIdGenerator,NameNode +Parameters,Parameters +QueryResponse,QueryResponse +SolrCatalog,SolrCatalog +SolrCatalogFactory,SolrCatalogFactory +SolrClient,SolrClient +UUIDProductIdGenerator,UUIDProductIdGenerator +Validation,Validation +ValidationLayer,ValidationLayer +ValidationLayer,ValidationLayerTheValidationLayer +ValidationLayer,ValidationLayerTheValidationLayer +ValidationLayer,Validation Layer extension point +ValidationLayer,Layer extension point +ValidationLayer,Layer The Validation Layer extension +ValidationLayerFactory,ValidationLayerFactory +ValidationLayerFactory,ValidationLayerTheValidationLayer +ValidationLayerFactory,Validation Layer extension point +DataSourceValidationLayer,DataSourceValidationLayer +DataSourceValidationLayer,Data nodes +DataSourceValidationLayer,DataIntensiveScientific +DataSourceValidationLayer,DataView +DataSourceValidationLayer,MoneydanceFinancialData +DataSourceValidationLayer,ValidationLayerTheValidationLayer +DataSourceValidationLayer,Data products +DataSourceValidationLayer,DataNodes +DataSourceValidationLayer,Data registry +DataSourceValidationLayer,CoreData +DataSourceValidationLayer,DataSystemPDS4InformationModel-Driven +DataSourceValidationLayer,DataList +DataSourceValidationLayer,DataSource +DataSourceValidationLayer,DataProtocols +DataSourceValidationLayer,Layer extension point +DataSourceValidationLayer,DataStructure +DataSourceValidationLayer,Layer The Validation Layer extension +DataSourceValidationLayer,DataGridManagementSystem +DataSourceValidationLayer,DataManagementSystems +DataSourceValidationLayer,Data volume +DataSourceValidationLayer,DataSystemDevelopment +DataSourceValidationLayerFactory,DataSourceValidationLayerFactory +DataSourceValidationLayerFactory,Data nodes +DataSourceValidationLayerFactory,DataIntensiveScientific +DataSourceValidationLayerFactory,DataView +DataSourceValidationLayerFactory,MoneydanceFinancialData +DataSourceValidationLayerFactory,Data Transfer extension point 
+DataSourceValidationLayerFactory,Data products +DataSourceValidationLayerFactory,DataNodes +DataSourceValidationLayerFactory,Data registry +DataSourceValidationLayerFactory,CoreData +DataSourceValidationLayerFactory,DataList +DataSourceValidationLayerFactory,GoddardEarthScienceData +DataSourceValidationLayerFactory,DataSource +DataSourceValidationLayerFactory,DataProtocols +DataSourceValidationLayerFactory,DataStructure +DataSourceValidationLayerFactory,DataGridManagementSystem +DataSourceValidationLayerFactory,DataManagementSystems +DataSourceValidationLayerFactory,Data volume +DataSourceValidationLayerFactory,DataSystemDevelopment +ScienceDataValidationLayer,ScienceDataValidationLayer +ScienceDataValidationLayer,SciencePgeConfigFileWriter +ScienceDataValidationLayer,ValidationLayerTheValidationLayer +ScienceDataValidationLayer,ComputerScience +ScienceDataValidationLayer,ScienceProduct +ScienceDataValidationLayer,GameScience +ScienceDataValidationLayer,SpaceScience +ScienceDataValidationLayer,Layer extension point +ScienceDataValidationLayer,ScienceDataProcessingSystems +ScienceDataValidationLayer,Layer The Validation Layer extension +ScienceDataValidationLayerFactory,ScienceDataValidationLayerFactory +ScienceDataValidationLayerFactory,ComputerScience +ScienceDataValidationLayerFactory,ScienceProduct +ScienceDataValidationLayerFactory,GameScience +ScienceDataValidationLayerFactory,SpaceScience +ScienceDataValidationLayerFactory,ScienceDataProcessingSystems +TestXMLValidationLayer,TestXMLValidationLayer +TestXMLValidationLayer,ValidationLayerTheValidationLayer +TestXMLValidationLayer,Layer extension point +TestXMLValidationLayer,Layer The Validation Layer extension +XMLValidationLayer,XMLValidationLayer +XMLValidationLayer,XMLMetadataConceptCatalog +XMLValidationLayer,XML files +XMLValidationLayer,ValidationLayerTheValidationLayer +XMLValidationLayer,XML Metadata Concept catalog +XMLValidationLayer,XML syntax +XMLValidationLayer,Layer The Validation Layer extension +XMLValidationLayerFactory,XMLValidationLayerFactory +XMLValidationLayerFactory,XML files +XMLValidationLayerFactory,XML syntax +versioning,versioning +Versioner,Versioner +AcquisitionDateVersioner,AcquisitionDateVersioner +BasicVersioner,BasicVersioner +BasicVersioner,Versioner extension point +ConfigurableMetadataBasedFileVersioner,ConfigurableMetadataBasedFileVersioner +ConfigurableMetadataBasedFileVersioner,Versioner extension point +DateTimeVersioner,DateTimeVersioner +DirectoryProductVersioner,DirectoryProductVersioner +InPlaceVersioner,InPlaceVersioner +MetadataBasedFileVersioner,MetadataBasedFileVersioner +MetadataBasedFileVersioner,Versioner extension point +MetadataBasedFileVersioner,ProductMetadata +MetadataBasedFileVersioner,Metadata objects +MetadataBasedFileVersioner,Metadata information +MetadataBasedFileVersioner,Metadata management +ProductTypeMetVersioner,ProductTypeMetVersioner +ProductTypeMetVersioner,Versioner extension point +ProductTypeMetVersioner,Product structures +ProductTypeMetVersioner,ProductMetadata +ProductTypeMetVersioner,ProductGenerationExecutives +ProductTypeMetVersioner,ProductPctTransferred +ProductTypeMetVersioner,ProductId +ProductTypeMetVersioner,ProductTypes +ProductTypeMetVersioner,ScienceProduct +ProductTypeMetVersioner,ProductStructure +ProductTypeMetVersioner,Product operation +ProductTypeMetVersioner,Product instance information +ProductTypeMetVersioner,Product listing pages +SingleFileBasicVersioner,SingleFileBasicVersioner +SingleFileBasicVersioner,Versioner extension point 
+TestAcquisitionDateVersioner,TestAcquisitionDateVersioner +TestBasicVersioner,TestBasicVersioner +TestConfigurableMetadataBasedFileVersioner,TestConfigurableMetadataBasedFileVersioner +TestConfigurableMetadataBasedFileVersioner,Versioner extension point +TestDateTimeVersioner,TestDateTimeVersioner +TestDirectoryBasedProductVersioner,TestDirectoryBasedProductVersioner +TestDirectoryBasedProductVersioner,Versioner extension point +TestInPlaceVersioner,TestInPlaceVersioner +TestInPlaceVersioner,Versioner extension point +TestMetadataBasedFileVersioner,TestMetadataBasedFileVersioner +TestMetadataBasedFileVersioner,Versioner extension point +TestProductTypeMetVersioner,TestProductTypeMetVersioner +TestProductTypeMetVersioner,Versioner extension point +TestSingleFileBasicVersioner,TestSingleFileBasicVersioner +TestSingleFileBasicVersioner,Versioner extension point +VersioningUtils,VersioningUtils +ingest,ingest +Cache,Cache +CacheFactory,CacheFactory +Ingester,Ingester +RemoteableCache,RemoteableCache +AbstractCacheServerFactory,AbstractCacheServerFactory +CachedIngester,CachedIngester +CmdLineIngester,CmdLineIngester +LocalCache,LocalCache +LocalCacheFactory,LocalCacheFactory +RmiCache,RmiCache +RmiCacheFactory,RmiCacheFactory +RmiCacheServer,RmiCacheServer +RmiCacheServerFactory,RmiCacheServerFactory +StdIngester,StdIngester +TestCachedIngester,TestCachedIngester +TestLocalCache,TestLocalCache +TestRmiCache,TestRmiCache +TestStdIngester,TestStdIngester +datatransfer,datatransfer +DataTransfer,DataTransfer +DataTransfer,Data nodes +DataTransfer,DataIntensiveScientific +DataTransfer,LocalDataTransfer +DataTransfer,client Transfer +DataTransfer,MoneydanceFinancialData +DataTransfer,PlaceDataTransfer +DataTransfer,Data Transfer extension point +DataTransfer,Data Transfer interface +DataTransfer,Data products +DataTransfer,DataNodes +DataTransfer,RemoteDataTransfer +DataTransfer,Data registry +DataTransfer,DataSystemPDS4InformationModel-Driven +DataTransfer,DataList +DataTransfer,GoddardEarthScienceData +DataTransfer,DataStructure +DataTransfer,DataGridManagementSystem +DataTransfer,DataManagementSystems +DataTransfer,Data volume +DataTransfer,DataSystemDevelopment +DataTransferFactory,DataTransferFactory +DataTransferFactory,Data nodes +DataTransferFactory,Data Transfer extension point +DataTransferFactory,Data products +DataTransferFactory,DataNodes +DataTransferFactory,Data registry +DataTransferFactory,CoreData +DataTransferFactory,DataSystemPDS4InformationModel-Driven +DataTransferFactory,DataList +DataTransferFactory,GoddardEarthScienceData +DataTransferFactory,DataStructure +DataTransferFactory,DataGridManagementSystem +DataTransferFactory,Data volume +InPlaceDataTransferer,InPlaceDataTransferer +InPlaceDataTransferFactory,InPlaceDataTransferFactory +LocalDataTransferer,LocalDataTransferer +LocalDataTransferFactory,LocalDataTransferFactory +LocalDataTransferFactory,LocalDataTransfer +RemoteDataTransferer,RemoteDataTransferer +RemoteDataTransferer,RemoteFileSystemProvider +RemoteDataTransferFactory,RemoteDataTransferFactory +RemoteDataTransferFactory,RemoteDataTransfer +S3DataTransferer,S3DataTransferer +S3DataTransfererFactory,S3DataTransfererFactory +TestInPlaceDataTransferer,TestInPlaceDataTransferer +TestLocalDataTransferer,TestLocalDataTransferer +TestS3DataTransferer,TestS3DataTransferer +TestS3DataTransfererFactory,TestS3DataTransfererFactory +TransferStatusTracker,TransferStatusTracker +Cli,Cli +TestFileManagerCli,TestFileManagerCli 
+UseMockClientCmdLineActionStore,UseMockClientCmdLineActionStore +UseMockClientCmdLineActionStore,UseGit +UseMockClientCmdLineActionStore,Use npm +UseMockClientCmdLineActionStore,ElementStore +AbstractDeleteProductCliAction,AbstractDeleteProductCliAction +AbstractGetProductCliAction,AbstractGetProductCliAction +AbstractQueryCliAction,AbstractQueryCliAction +AddProductTypeCliAction,AddProductTypeCliAction +DeleteProductByIdCliAction,DeleteProductByIdCliAction +DeleteProductByNameCliAction,DeleteProductByNameCliAction +DumpMetadataCliAction,DumpMetadataCliAction +FileManagerCliAction,FileManagerCliAction +FileManagerCliAction,FileManagerAug +FileManagerCliAction,File Browser dialogs +FileManagerCliAction,FileManagerServer +FileManagerCliAction,FileManagerUsage +FileManagerCliAction,File Browser plugin +FileManagerCliAction,FileManagerPage +FileManagerCliAction,FileManagerManage +FileManagerCliAction,File Manager architecture +FileManagerCliAction,File Manager file System Provider +FileManagerCliAction,File information +FileManagerCliAction,File Manager loads +FileManagerCliAction,File Manager services +GetCurrentTransferCliAction,GetCurrentTransferCliAction +GetCurrentTransfersCliAction,GetCurrentTransfersCliAction +GetFilePercentTransferredCliAction,GetFilePercentTransferredCliAction +GetFirstPageCliAction,GetFirstPageCliAction +GetLastPageCliAction,GetLastPageCliAction +GetNextPageCliAction,GetNextPageCliAction +GetNumProductsCliAction,GetNumProductsCliAction +GetPrevPageCliAction,GetPrevPageCliAction +GetProductByIdCliAction,GetProductByIdCliAction +GetProductByNameCliAction,GetProductByNameCliAction +GetProductPercentTransferredCliAction,GetProductPercentTransferredCliAction +GetProductTypeByNameCliAction,GetProductTypeByNameCliAction +HasProductCliAction,HasProductCliAction +IngestProductCliAction,IngestProductCliAction +LuceneQueryCliAction,LuceneQueryCliAction +LuceneQueryCliAction,LuceneIndexCatalog +LuceneQueryCliAction,default Apache Lucene +RetrieveFilesCliAction,RetrieveFilesCliAction +SqlQueryCliAction,SqlQueryCliAction +Util,Util +Pagination,Pagination +DbStructFactory,DbStructFactory +GenericFileManagerObjectFactory,GenericFileManagerObjectFactory +QueryUtils,QueryUtils +SqlParser,SqlParser +TestGenericFileManagerObjectStructFactory,TestGenericFileManagerObjectStructFactory +TestXmlRpcStructFactory,TestXmlRpcStructFactory +TestXmlStructFactory,TestXmlStructFactory +XmlRpcStructFactory,XmlRpcStructFactory +XmlStructFactory,XmlStructFactory +tools,tools +CASAnalyzer,CASAnalyzer +CatalogSearch,CatalogSearch +CatalogSearch,XMLMetadataConceptCatalog +CatalogSearch,LuceneIndexCatalog +CatalogSearch,AMGAMetadataCatalog +CatalogSearch,Catalog extension point interface +CatalogSearch,Catalog extension point +CatalogSearch,Catalog interface +CatalogSearch,ApacheOODTCatalog +DeleteProduct,DeleteProduct +DeleteProduct,ProductGenerationExecutives +DeleteProduct,ProductPctTransferred +DeleteProduct,ScienceProduct +DeleteProduct,Product instance information +DeleteProduct,Product instance metadata +DumpDbElementsToXml,DumpDbElementsToXml +ExpImpCatalog,ExpImpCatalog +MetadataBasedProductMover,MetadataBasedProductMover +MetadataBasedProductMover,ProductMetadata +MetadataBasedProductMover,Metadata objects +MetadataBasedProductMover,Metadata information +MetadataBasedProductMover,Metadata management +MetadataBasedProductMover,custom PGE Metadata +MetadataBasedProductMover,Metadata generation +MetadataDumper,MetadataDumper +OptimizeLuceneCatalog,OptimizeLuceneCatalog 
+OptimizeLuceneCatalog,XMLMetadataConceptCatalog +OptimizeLuceneCatalog,Catalog extension point interface +OptimizeLuceneCatalog,LuceneCatalog +ProductDumper,ProductDumper +ProductTypeDocTool,ProductTypeDocTool +ProductTypeDocTool,Product structures +ProductTypeDocTool,ProductMetadata +ProductTypeDocTool,ProductGenerationExecutives +ProductTypeDocTool,QueryTool +ProductTypeDocTool,ProductPctTransferred +ProductTypeDocTool,ProductId +ProductTypeDocTool,ProductTypes +ProductTypeDocTool,ScienceProduct +ProductTypeDocTool,ProductStructure +ProductTypeDocTool,Product operation +ProductTypeDocTool,Product instance information +ProductTypeDocTool,Product listing pages +QueryTool,QueryTool +RangeQueryTester,RangeQueryTester +SolrIndexer,SolrIndexer +CliAction,CliAction diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-ziyan.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-ziyan.txt new file mode 100644 index 0000000000000000000000000000000000000000..86d84701d53f6f7aafc94d9d5d26256becbe1abe --- /dev/null +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY-ziyan.txt @@ -0,0 +1,679 @@ +structs , structs +Boolean Query Criteria , Boolean Query Criteria +Element , Element +Extractor Spec , Extractor Spec +Extractor Spec , spicified Extractor extractors +Extractor Spec , Mime Type Extractor +Extractor Spec , Core Met Extractor +Extractor Spec , Job Spec +File Transfer Status , File Transfer Status +File Transfer Status , File Manager extension points +File Transfer Status , File Manager Dashboard Nulled +File Transfer Status , Dev Extreme Java Script File +File Transfer Status , File Manager UI component +File Transfer Status , File Manager server interface +File Transfer Status , File Manager Command class +File Transfer Status , File Manager web interface +File Transfer Status , File Manager App UI +File Transfer Status , File content +File Transfer Status , JSX File +File Transfer Status , File Manager file System Provider +File Transfer Status , File type +File Transfer Status , File information +Free Text Query Criteria , Free Text Query Criteria +Free Text Query Criteria , Free Pastry +Free Text Query Criteria , implementation Free +Product , Product +Product Page , Product Page +Product Page , Product structures +Product Page , Product Metadata +Product Page , Product Generation Executives +Product Page , Product Pct Transferred +Product Page , Product Id +Product Page , File Manager Page +Product Page , Product Types +Product Page , Product Structure +Product Page , Product operation +Product Page , Product instance information +Product Page , Product listing pages +Product Type , Product Type +Product Type , Product structures +Product Type , Product Metadata +Product Type , Type Script script +Product Type , Product Generation Executives +Product Type , Product Pct Transferred +Product Type , Product Id +Product Type , Product Types +Product Type , Product Structure +Product Type , Product operation +Product Type , Product instance information +Product Type , product Type +Product Type , Product listing pages +Query , Query +Query Criteria , Query Criteria +Range Query Criteria , Range 
Query Criteria +Reference , Reference +Term Query Criteria , Term Query Criteria +Test Product , Test Product +Test Reference , Test Reference +System , System +Xml Rpc File Manager , Xml Rpc File Manager +Xml Rpc File Manager , import File Manager +Xml Rpc File Manager , Open File Manager +Xml Rpc File Manager , Xml Rpc File Manager Client +Xml Rpc File Manager , Trillo File Manager +Xml Rpc File Manager , Kendo UI File Manager +Xml Rpc File Manager , Manager The Repository Manager extension +Xml Rpc File Manager , Extreme Java Script File Manager +Xml Rpc File Manager , Repository Manager The Repository Manager +Xml Rpc File Manager , CAS File Manager +Xml Rpc File Manager , Xml Rpc File Manager complex +Xml Rpc File Manager , Manager extension point +Xml Rpc File Manager , Webix File Manager +Xml Rpc File Manager , bs4 File Manager +Xml Rpc File Manager , Apache OODT File Manager +Xml Rpc File Manager Client , Xml Rpc File Manager Client +Xml Rpc File Manager Client , Client implementation +Xml Rpc File Manager Client , Client React connector +Xml Rpc File Manager Client , Client React component +Xml Rpc File Manager Client , Client side metadata +Xml Rpc File Manager Client , Mahasen Client +Dispatcher , Dispatcher +Result , Result +Secure Web Server , Secure Web Server +Secure Web Server , Apache HTTPD Web Server +Secure Web Server , Server Node API v1 Localization +Secure Web Server , Windows Server +Repository , Repository +Repository Manager , Repository Manager +Repository Manager , Software Package Manager +Repository Manager , Task Manager +Repository Manager , import File Manager +Repository Manager , Open File Manager +Repository Manager , NET Core File Manager +Repository Manager , OODT CAS File Manager +Repository Manager , Node Package Manager +Repository Manager , Trillo File Manager +Repository Manager , Kendo UI File Manager +Repository Manager , Word Press File Manager +Repository Manager , Manager The Repository Manager extension +Repository Manager , Manager Ingest Use Case +Repository Manager , Extreme Java Script File Manager +Repository Manager , Repository Manager extension point +Repository Manager , Repository Manager The Repository Manager +Repository Manager , Repository Manager The Repository Manager +Repository Manager , CAS File Manager +Repository Manager , Manager Object Model +Repository Manager , demand Library Manager +Repository Manager , Deploy Trillo File Manager +Repository Manager , Manager extension point +Repository Manager , Webix File Manager +Repository Manager , bs4 File Manager +Repository Manager , Apache OODT File Manager +Repository Manager Factory , Repository Manager Factory +Repository Manager Factory , Repository Manager extension point +Repository Manager Factory , Repository Manager The Repository Manager +Data Source Repository Manager , Data Source Repository Manager +Data Source Repository Manager , Software Package Manager +Data Source Repository Manager , Data nodes +Data Source Repository Manager , Manager control +Data Source Repository Manager , Task Manager +Data Source Repository Manager , import File Manager +Data Source Repository Manager , Open File Manager +Data Source Repository Manager , NET Core File Manager +Data Source Repository Manager , OODT CAS File Manager +Data Source Repository Manager , Node Package Manager +Data Source Repository Manager , Moneydance Financial Data +Data Source Repository Manager , Data Transfer interface +Data Source Repository Manager , Manager The Repository Manager extension +Data 
Source Repository Manager , Data products +Data Source Repository Manager , Extreme Java Script File Manager +Data Source Repository Manager , Manager Architecture +Data Source Repository Manager , Data registry +Data Source Repository Manager , Manager catalog +Data Source Repository Manager , Repository Manager The Repository Manager +Data Source Repository Manager , CAS File Manager +Data Source Repository Manager , Manager Object Model +Data Source Repository Manager , demand Library Manager +Data Source Repository Manager , Manager extension point +Data Source Repository Manager , Webix File Manager +Data Source Repository Manager , Data Management Systems +Data Source Repository Manager , Data volume +Data Source Repository Manager , Data System Development +Data Source Repository Manager Factory , Data Source Repository Manager Factory +Data Source Repository Manager Factory , Data nodes +Data Source Repository Manager Factory , Moneydance Financial Data +Data Source Repository Manager Factory , Data Transfer extension point +Data Source Repository Manager Factory , Data Transfer interface +Data Source Repository Manager Factory , Data products +Data Source Repository Manager Factory , Data registry +Data Source Repository Manager Factory , Data Grid Management System +Data Source Repository Manager Factory , Data Management Systems +Data Source Repository Manager Factory , Data volume +Data Source Repository Manager Factory , Data System Development +Science Data Repository Manager , Science Data Repository Manager +Science Data Repository Manager , Software Package Manager +Science Data Repository Manager , Manager control +Science Data Repository Manager , Task Manager +Science Data Repository Manager , import File Manager +Science Data Repository Manager , Open File Manager +Science Data Repository Manager , NET Core File Manager +Science Data Repository Manager , OODT CAS File Manager +Science Data Repository Manager , Science Pge Config File Writer +Science Data Repository Manager , Node Package Manager +Science Data Repository Manager , Trillo File Manager +Science Data Repository Manager , Kendo UI File Manager +Science Data Repository Manager , Word Press File Manager +Science Data Repository Manager , Manager The Repository Manager extension +Science Data Repository Manager , Computer Science +Science Data Repository Manager , Extreme Java Script File Manager +Science Data Repository Manager , Manager Architecture +Science Data Repository Manager , Science Product +Science Data Repository Manager , Manager catalog +Science Data Repository Manager , Repository Manager The Repository Manager +Science Data Repository Manager , CAS File Manager +Science Data Repository Manager , Manager Object Model +Science Data Repository Manager , Space Science +Science Data Repository Manager , demand Library Manager +Science Data Repository Manager , Deploy Trillo File Manager +Science Data Repository Manager , Manager extension point +Science Data Repository Manager , Webix File Manager +Science Data Repository Manager , bs4 File Manager +Science Data Repository Manager , Apache OODT File Manager +Science Data Repository Manager Factory , Science Data Repository Manager Factory +Science Data Repository Manager Factory , Computer Science +Science Data Repository Manager Factory , Science Product +Science Data Repository Manager Factory , Space Science +Science Data Repository Manager Factory , Science Data Processing Systems +Test XML Repository Manager , Test XML Repository Manager +Test 
XML Repository Manager , Software Package Manager +Test XML Repository Manager , Manager control +Test XML Repository Manager , Task Manager +Test XML Repository Manager , import File Manager +Test XML Repository Manager , Open File Manager +Test XML Repository Manager , Node Package Manager +Test XML Repository Manager , Trillo File Manager +Test XML Repository Manager , Manager The Repository Manager extension +Test XML Repository Manager , Extreme Java Script File Manager +Test XML Repository Manager , Manager Architecture +Test XML Repository Manager , Manager catalog +Test XML Repository Manager , Repository Manager The Repository Manager +Test XML Repository Manager , CAS File Manager +Test XML Repository Manager , Manager Object Model +Test XML Repository Manager , demand Library Manager +Test XML Repository Manager , Manager extension point +Test XML Repository Manager , Webix File Manager +Test XML Repository Manager , bs4 File Manager +XML Repository Manager , XML Repository Manager +XML Repository Manager , XML Metadata Concept Catalog +XML Repository Manager , Manager control +XML Repository Manager , Task Manager +XML Repository Manager , import File Manager +XML Repository Manager , Open File Manager +XML Repository Manager , NET Core File Manager +XML Repository Manager , OODT CAS File Manager +XML Repository Manager , XML files +XML Repository Manager , Trillo File Manager +XML Repository Manager , Kendo UI File Manager +XML Repository Manager , Word Press File Manager +XML Repository Manager , Manager The Repository Manager extension +XML Repository Manager , Manager Ingest Use Case +XML Repository Manager , Extreme Java Script File Manager +XML Repository Manager , Manager Architecture +XML Repository Manager , Manager catalog +XML Repository Manager , Repository Manager The Repository Manager +XML Repository Manager , CAS File Manager +XML Repository Manager , Deploy Trillo File Manager +XML Repository Manager , Webix File Manager +XML Repository Manager , bs4 File Manager +XML Repository Manager , Apache OODT File Manager +XML Repository Manager Factory , XML Repository Manager Factory +XML Repository Manager Factory , XML files +Metadata , Metadata +Core Met Keys , Core Met Keys +Core Met Keys , Core machines +Core Met Keys , Core Data +File Attributes Met Keys , File Attributes Met Keys +File Attributes Met Keys , File Managers functionality +File Attributes Met Keys , File Browser dialogs +File Attributes Met Keys , File Retrieval System +File Attributes Met Keys , File Upload Component +File Attributes Met Keys , Dev Extreme Java Script File +File Attributes Met Keys , File Location Extractor +File Attributes Met Keys , File Pct Transferred +File Attributes Met Keys , File Repository layouts +File Attributes Met Keys , File content +File Attributes Met Keys , JSX File +File Attributes Met Keys , File type +Product Met Keys , Product Met Keys +Product Met Keys , Product structures +Product Met Keys , Product Metadata +Product Met Keys , Product Id +Product Met Keys , Product Types +Product Met Keys , Science Product +Product Met Keys , Product Structure +Product Met Keys , Product operation +Filemgr Met Extractor , Filemgr Met Extractor +Filemgr Met Extractor , Core Met Extractor +Abstract Filemgr Met Extractor , Abstract Filemgr Met Extractor +Abstract Filemgr Met Extractor , Mime Type Extractor +Abstract Filemgr Met Extractor , Core Met Extractor +Core Met Extractor , Core Met Extractor +Core Met Extractor , Core machines +Core Met Extractor , Extractor extractors 
+Core Met Extractor , Mime Type Extractor +Core Met Extractor , Core Data +Test Abstract Filemgr Met Extractor , Test Abstract Filemgr Met Extractor +Test Abstract Filemgr Met Extractor , Extractor extractors +Test Abstract Filemgr Met Extractor , Mime Type Extractor +Test Abstract Filemgr Met Extractor , Core Met Extractor +Test Core Met Extractor , Test Core Met Extractor +Test Core Met Extractor , Extractor extractors +Test Core Met Extractor , Mime Type Extractor +Test Core Met Extractor , Core Met Extractor +Catalog , Catalog +Catalog Factory , Catalog Factory +Data Source Catalog , Data Source Catalog +Data Source Catalog , XML Metadata Concept Catalog +Data Source Catalog , Data nodes +Data Source Catalog , Lucene Index Catalog +Data Source Catalog , Catalog extension point interface +Data Source Catalog , Data Transfer extension point +Data Source Catalog , Data products +Data Source Catalog , Data Nodes +Data Source Catalog , Data registry +Data Source Catalog , Core Data +Data Source Catalog , Data System PDS4 Information Model-Driven +Data Source Catalog , Lucene Catalog +Data Source Catalog , Data List +Data Source Catalog , Catalog interface +Data Source Catalog , Goddard Earth Science Data +Data Source Catalog , Data Structure +Data Source Catalog , Data Grid Management System +Data Source Catalog , Data volume +Data Source Catalog Factory , Data Source Catalog Factory +Data Source Catalog Factory , Data nodes +Data Source Catalog Factory , Data Intensive Scientific +Data Source Catalog Factory , Moneydance Financial Data +Data Source Catalog Factory , Data Transfer interface +Data Source Catalog Factory , Data products +Data Source Catalog Factory , Data Nodes +Data Source Catalog Factory , Data registry +Data Source Catalog Factory , Core Data +Data Source Catalog Factory , Data System PDS4 Information Model-Driven +Data Source Catalog Factory , Data List +Data Source Catalog Factory , Data Structure +Data Source Catalog Factory , Data Management Systems +Data Source Catalog Factory , Data volume +Data Source Catalog Factory , Data System Development +Lenient Data Source Catalog , Lenient Data Source Catalog +Lenient Data Source Catalog , Lucene Index Catalog +Lenient Data Source Catalog , AMGA Metadata Catalog +Lenient Data Source Catalog , Catalog extension point +Lenient Data Source Catalog , Lucene Catalog +Lenient Data Source Catalog , Catalog interface +Lenient Data Source Catalog , Apache OODT Catalog +Lucene Catalog , Lucene Catalog +Lucene Catalog , XML Metadata Concept Catalog +Lucene Catalog , Lucene Index Catalog +Lucene Catalog , Lucene Index Catalog +Lucene Catalog , AMGA Metadata Catalog +Lucene Catalog , Catalog extension point interface +Lucene Catalog , Catalog extension point +Lucene Catalog , Apache OODT Catalog +Lucene Catalog , default Apache Lucene +Lucene Catalog Factory , Lucene Catalog Factory +Lucene Catalog Factory , Lucene Catalog +Mapped Data Source Catalog , Mapped Data Source Catalog +Mapped Data Source Catalog , Lucene Index Catalog +Mapped Data Source Catalog , AMGA Metadata Catalog +Mapped Data Source Catalog , Catalog extension point +Mapped Data Source Catalog , Lucene Catalog +Mapped Data Source Catalog , Catalog interface +Mapped Data Source Catalog , Apache OODT Catalog +Mapped Data Source Catalog Factory , Mapped Data Source Catalog Factory +Mock Catalog , Mock Catalog +Mock Catalog Factory , Mock Catalog Factory +Science Data Catalog , Science Data Catalog +Science Data Catalog , XML Metadata Concept Catalog +Science Data Catalog , 
AMGA Metadata Catalog +Science Data Catalog , Science Pge Config File Writer +Science Data Catalog , Catalog extension point interface +Science Data Catalog , Computer Science +Science Data Catalog , Science Product +Science Data Catalog , Lucene Catalog +Science Data Catalog , Catalog interface +Science Data Catalog , Apache OODT Catalog +Science Data Catalog , Space Science +Science Data Catalog , Science Data Processing Systems +Science Data Catalog Factory , Science Data Catalog Factory +Science Data Catalog Factory , Science Pge Config File Writer +Science Data Catalog Factory , Computer Science +Science Data Catalog Factory , Science Product +Science Data Catalog Factory , Space Science +Catalog , Catalog +Product Id Generator , Product Id Generator +Product Id Generator , Product structures +Product Id Generator , Product Id +Product Id Generator , Product Types +Product Id Generator , Science Product +Product Id Generator , Product Structure +Product Id Generator , Product operation +Product Serializer , Product Serializer +Complete Product , Complete Product +Complete Product , Product Generation Executives +Complete Product , Product Pct Transferred +Complete Product , Science Product +Complete Product , Product instance information +Complete Product , Product listing pages +Default Product Serializer , Default Product Serializer +Default Product Serializer , Default configuration +Name Product Id Generator , Name Product Id Generator +Name Product Id Generator , Name Node +Parameters , Parameters +Query Response , Query Response +Solr Catalog , Solr Catalog +Solr Catalog Factory , Solr Catalog Factory +Solr Client , Solr Client +UUID Product Id Generator , UUID Product Id Generator +Validation , Validation +Validation Layer , Validation Layer +Validation Layer , Validation Layer The Validation Layer +Validation Layer , Validation Layer The Validation Layer +Validation Layer , Validation Layer extension point +Validation Layer , Layer extension point +Validation Layer , Layer The Validation Layer extension +Validation Layer Factory , Validation Layer Factory +Validation Layer Factory , Validation Layer The Validation Layer +Validation Layer Factory , Validation Layer extension point +Data Source Validation Layer , Data Source Validation Layer +Data Source Validation Layer , Data nodes +Data Source Validation Layer , Data Intensive Scientific +Data Source Validation Layer , Data View +Data Source Validation Layer , Moneydance Financial Data +Data Source Validation Layer , Validation Layer The Validation Layer +Data Source Validation Layer , Data products +Data Source Validation Layer , Data Nodes +Data Source Validation Layer , Data registry +Data Source Validation Layer , Core Data +Data Source Validation Layer , Data System PDS4 Information Model-Driven +Data Source Validation Layer , Data List +Data Source Validation Layer , Data Source +Data Source Validation Layer , Data Protocols +Data Source Validation Layer , Layer extension point +Data Source Validation Layer , Data Structure +Data Source Validation Layer , Layer The Validation Layer extension +Data Source Validation Layer , Data Grid Management System +Data Source Validation Layer , Data Management Systems +Data Source Validation Layer , Data volume +Data Source Validation Layer , Data System Development +Data Source Validation Layer Factory , Data Source Validation Layer Factory +Data Source Validation Layer Factory , Data nodes +Data Source Validation Layer Factory , Data Intensive Scientific +Data Source Validation 
Layer Factory , Data View +Data Source Validation Layer Factory , Moneydance Financial Data +Data Source Validation Layer Factory , Data Transfer extension point +Data Source Validation Layer Factory , Data products +Data Source Validation Layer Factory , Data Nodes +Data Source Validation Layer Factory , Data registry +Data Source Validation Layer Factory , Core Data +Data Source Validation Layer Factory , Data List +Data Source Validation Layer Factory , Goddard Earth Science Data +Data Source Validation Layer Factory , Data Source +Data Source Validation Layer Factory , Data Protocols +Data Source Validation Layer Factory , Data Structure +Data Source Validation Layer Factory , Data Grid Management System +Data Source Validation Layer Factory , Data Management Systems +Data Source Validation Layer Factory , Data volume +Data Source Validation Layer Factory , Data System Development +Science Data Validation Layer , Science Data Validation Layer +Science Data Validation Layer , Science Pge Config File Writer +Science Data Validation Layer , Validation Layer The Validation Layer +Science Data Validation Layer , Computer Science +Science Data Validation Layer , Science Product +Science Data Validation Layer , Game Science +Science Data Validation Layer , Space Science +Science Data Validation Layer , Layer extension point +Science Data Validation Layer , Science Data Processing Systems +Science Data Validation Layer , Layer The Validation Layer extension +Science Data Validation Layer Factory , Science Data Validation Layer Factory +Science Data Validation Layer Factory , Computer Science +Science Data Validation Layer Factory , Science Product +Science Data Validation Layer Factory , Game Science +Science Data Validation Layer Factory , Space Science +Science Data Validation Layer Factory , Science Data Processing Systems +Test XML Validation Layer , Test XML Validation Layer +Test XML Validation Layer , Validation Layer The Validation Layer +Test XML Validation Layer , Layer extension point +Test XML Validation Layer , Layer The Validation Layer extension +XML Validation Layer , XML Validation Layer +XML Validation Layer , XML Metadata Concept Catalog +XML Validation Layer , XML files +XML Validation Layer , Validation Layer The Validation Layer +XML Validation Layer , XML Metadata Concept catalog +XML Validation Layer , XML syntax +XML Validation Layer , Layer The Validation Layer extension +XML Validation Layer Factory , XML Validation Layer Factory +XML Validation Layer Factory , XML files +XML Validation Layer Factory , XML syntax +versioning , versioning +Versioner , Versioner +Acquisition Date Versioner , Acquisition Date Versioner +Basic Versioner , Basic Versioner +Basic Versioner , Versioner extension point +Configurable Metadata Based File Versioner , Configurable Metadata Based File Versioner +Configurable Metadata Based File Versioner , Versioner extension point +Date Time Versioner , Date Time Versioner +Directory Product Versioner , Directory Product Versioner +In Place Versioner , In Place Versioner +Metadata Based File Versioner , Metadata Based File Versioner +Metadata Based File Versioner , Versioner extension point +Metadata Based File Versioner , Product Metadata +Metadata Based File Versioner , Metadata objects +Metadata Based File Versioner , Metadata information +Metadata Based File Versioner , Metadata management +Product Type Met Versioner , Product Type Met Versioner +Product Type Met Versioner , Versioner extension point +Product Type Met Versioner , Product 
structures +Product Type Met Versioner , Product Metadata +Product Type Met Versioner , Product Generation Executives +Product Type Met Versioner , Product Pct Transferred +Product Type Met Versioner , Product Id +Product Type Met Versioner , Product Types +Product Type Met Versioner , Science Product +Product Type Met Versioner , Product Structure +Product Type Met Versioner , Product operation +Product Type Met Versioner , Product instance information +Product Type Met Versioner , Product listing pages +Single File Basic Versioner , Single File Basic Versioner +Single File Basic Versioner , Versioner extension point +Test Acquisition Date Versioner , Test Acquisition Date Versioner +Test Basic Versioner , Test Basic Versioner +Test Configurable Metadata Based File Versioner , Test Configurable Metadata Based File Versioner +Test Configurable Metadata Based File Versioner , Versioner extension point +Test Date Time Versioner , Test Date Time Versioner +Test Directory Based Product Versioner , Test Directory Based Product Versioner +Test Directory Based Product Versioner , Versioner extension point +Test In Place Versioner , Test In Place Versioner +Test In Place Versioner , Versioner extension point +Test Metadata Based File Versioner , Test Metadata Based File Versioner +Test Metadata Based File Versioner , Versioner extension point +Test Product Type Met Versioner , Test Product Type Met Versioner +Test Product Type Met Versioner , Versioner extension point +Test Single File Basic Versioner , Test Single File Basic Versioner +Test Single File Basic Versioner , Versioner extension point +Versioning Utils , Versioning Utils +ingest , ingest +Cache , Cache +Cache Factory , Cache Factory +Ingester , Ingester +Remoteable Cache , Remoteable Cache +Abstract Cache Server Factory , Abstract Cache Server Factory +Cached Ingester , Cached Ingester +CmdLine Ingester , CmdLine Ingester +Local Cache , Local Cache +Local Cache Factory , Local Cache Factory +Rmi Cache , Rmi Cache +Rmi Cache Factory , Rmi Cache Factory +Rmi Cache Server , Rmi Cache Server +Rmi Cache Server Factory , Rmi Cache Server Factory +Std Ingester , Std Ingester +Test Cached Ingester , Test Cached Ingester +Test Local Cache , Test Local Cache +Test Rmi Cache , Test Rmi Cache +Test Std Ingester , Test Std Ingester +datatransfer , datatransfer +Data Transfer , Data Transfer +Data Transfer , Data nodes +Data Transfer , Data Intensive Scientific +Data Transfer , Local Data Transfer +Data Transfer , client Transfer +Data Transfer , Moneydance Financial Data +Data Transfer , Place Data Transfer +Data Transfer , Data Transfer extension point +Data Transfer , Data Transfer interface +Data Transfer , Data products +Data Transfer , Data Nodes +Data Transfer , Remote Data Transfer +Data Transfer , Data registry +Data Transfer , Data System PDS4 Information Model-Driven +Data Transfer , Data List +Data Transfer , Goddard Earth Science Data +Data Transfer , Data Structure +Data Transfer , Data Grid Management System +Data Transfer , Data Management Systems +Data Transfer , Data volume +Data Transfer , Data System Development +Data Transfer Factory , Data Transfer Factory +Data Transfer Factory , Data nodes +Data Transfer Factory , Data Transfer extension point +Data Transfer Factory , Data products +Data Transfer Factory , Data Nodes +Data Transfer Factory , Data registry +Data Transfer Factory , Core Data +Data Transfer Factory , Data System PDS4 Information Model-Driven +Data Transfer Factory , Data List +Data Transfer Factory , Goddard 
Earth Science Data +Data Transfer Factory , Data Structure +Data Transfer Factory , Data Grid Management System +Data Transfer Factory , Data volume +In Place Data Transferer , In Place Data Transferer +In Place Data Transfer Factory , In Place Data Transfer Factory +Local Data Transferer , Local Data Transferer +Local Data Transfer Factory , Local Data Transfer Factory +Local Data Transfer Factory , Local Data Transfer +Remote Data Transferer , Remote Data Transferer +Remote Data Transferer , Remote File System Provider +Remote Data Transfer Factory , Remote Data Transfer Factory +Remote Data Transfer Factory , Remote Data Transfer +S3Data Transferer , S3Data Transferer +S3Data Transferer Factory , S3Data Transferer Factory +Test In Place Data Transferer , Test In Place Data Transferer +Test Local Data Transferer , Test Local Data Transferer +Test S3 Data Transferer , Test S3 Data Transferer +Test S3 Data Transferer Factory , Test S3 Data Transferer Factory +Transfer Status Tracker , Transfer Status Tracker +Cli , Cli +Test File Manager Cli , Test File Manager Cli +Use Mock Client Cmd Line Action Store , Use Mock Client Cmd Line Action Store +Use Mock Client Cmd Line Action Store , Use Git +Use Mock Client Cmd Line Action Store , Use npm +Use Mock Client Cmd Line Action Store , Element Store +Abstract Delete Product Cli Action , Abstract Delete Product Cli Action +Abstract Get Product Cli Action , Abstract Get Product Cli Action +Abstract Query Cli Action , Abstract Query Cli Action +Add Product Type Cli Action , Add Product Type Cli Action +Delete Product By Id Cli Action , Delete Product By Id Cli Action +Delete Product By Name Cli Action , Delete Product By Name Cli Action +Dump Metadata Cli Action , Dump Metadata Cli Action +File Manager Cli Action , File Manager Cli Action +File Manager Cli Action , File Manager Aug +File Manager Cli Action , File Browser dialogs +File Manager Cli Action , File Manager Server +File Manager Cli Action , File Manager Usage +File Manager Cli Action , File Browser plugin +File Manager Cli Action , File Manager Page +File Manager Cli Action , File Manager Manage +File Manager Cli Action , File Manager architecture +File Manager Cli Action , File Manager file System Provider +File Manager Cli Action , File information +File Manager Cli Action , File Manager loads +File Manager Cli Action , File Manager services +Get Current Transfer Cli Action , Get Current Transfer Cli Action +Get Current Transfers Cli Action , Get Current Transfers Cli Action +Get File Percent Transferred Cli Action , Get File Percent Transferred Cli Action +Get First Page Cli Action , Get First Page Cli Action +Get Last Page Cli Action , Get Last Page Cli Action +Get Next Page Cli Action , Get Next Page Cli Action +Get Num Products Cli Action , Get Num Products Cli Action +Get Prev Page Cli Action , Get Prev Page Cli Action +Get Product By Id Cli Action , Get Product By Id Cli Action +Get Product By Name Cli Action , Get Product By Name Cli Action +Get Product Percent Transferred Cli Action , Get Product Percent Transferred Cli Action +Get Product Type By Name Cli Action , Get Product Type By Name Cli Action +Has Product Cli Action , Has Product Cli Action +Ingest Product Cli Action , Ingest Product Cli Action +Lucene Query Cli Action , Lucene Query Cli Action +Lucene Query Cli Action , Lucene Index Catalog +Lucene Query Cli Action , default Apache Lucene +Retrieve Files Cli Action , Retrieve Files Cli Action +Sql Query Cli Action , Sql Query Cli Action +Util , Util +Pagination , 
Pagination +Db Struct Factory , Db Struct Factory +Generic File Manager Object Factory , Generic File Manager Object Factory +Query Utils , Query Utils +Sql Parser , Sql Parser +Test Generic File Manager Object Struct Factory , Test Generic File Manager Object Struct Factory +Test Xml Rpc Struct Factory , Test Xml Rpc Struct Factory +Test Xml Struct Factory , Test Xml Struct Factory +Xml Rpc Struct Factory , Xml Rpc Struct Factory +Xml Struct Factory , Xml Struct Factory +tools , tools +CAS Analyzer , CAS Analyzer +Catalog Search , Catalog Search +Catalog Search , XML Metadata Concept Catalog +Catalog Search , Lucene Index Catalog +Catalog Search , AMGA Metadata Catalog +Catalog Search , Catalog extension point interface +Catalog Search , Catalog extension point +Catalog Search , Catalog interface +Catalog Search , Apache OODT Catalog +Delete Product , Delete Product +Delete Product , Product Generation Executives +Delete Product , Product Pct Transferred +Delete Product , Science Product +Delete Product , Product instance information +Delete Product , Product instance metadata +Dump Db Elements To Xml , Dump Db Elements To Xml +ExpImp Catalog , ExpImp Catalog +Metadata Based Product Mover , Metadata Based Product Mover +Metadata Based Product Mover , Product Metadata +Metadata Based Product Mover , Metadata objects +Metadata Based Product Mover , Metadata information +Metadata Based Product Mover , Metadata management +Metadata Based Product Mover , custom PGE Metadata +Metadata Based Product Mover , Metadata generation +Metadata Dumper , Metadata Dumper +Optimize Lucene Catalog , Optimize Lucene Catalog +Optimize Lucene Catalog , XML Metadata Concept Catalog +Optimize Lucene Catalog , Catalog extension point interface +Optimize Lucene Catalog , Lucene Catalog +Product Dumper , Product Dumper +Product Type Doc Tool , Product Type Doc Tool +Product Type Doc Tool , Product structures +Product Type Doc Tool , Product Metadata +Product Type Doc Tool , Product Generation Executives +Product Type Doc Tool , Query Tool +Product Type Doc Tool , Product Pct Transferred +Product Type Doc Tool , Product Id +Product Type Doc Tool , Product Types +Product Type Doc Tool , Science Product +Product Type Doc Tool , Product Structure +Product Type Doc Tool , Product operation +Product Type Doc Tool , Product instance information +Product Type Doc Tool , Product listing pages +Query Tool , Query Tool +Range Query Tester , Range Query Tester +Solr Indexer , Solr Indexer +Cli Action , Cli Action \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt.xml.xls b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..6a7e54ee95a8fecab006df314349ac4b66ff9f8e Binary files /dev/null and b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions-relation.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process 
Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions-relation.txt index de3ecb351d1cf7a834e774d043c5e3cc5845f0eb..9871327f3ddbe39738ab1f1a713e4551a4e6f596 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions-relation.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions-relation.txt @@ -1,716 +1,591 @@ -A Reusable Process Control System Framework for the Orbiting Carbon -Observatory and NPP Sounder PEATE missions -Article · July 2009 -DOI: 10.1109/SMC-IT.2009.27 -Chris A. Mattmann, Dana Freeborn, Dan Crichton, Brian Foster, -Andrew Hart, David Woollard, Sean Hardman, Paul Ramirez, -Sean Kelly, Albert Y. Chang, Charles E. Miller -Jet Propulsion Laboratory -California Institute of Technology -Pasadena, CA 91109, USA -mattmann@jpl.nasa.gov -Abstract -We describe a reusable architecture and implementation -framework for managing science processing pipelines for -mission ground data systems. Our system, dubbed “PCS”, -for Process Control System, improves upon an existing software -component, the OODT Catalog and Archive (CAS), -which has already supported the QuikSCAT, SeaWinds and -AMT earth science missions. This paper focuses on PCS -within the context of two current earth science missions: the -Orbiting Carbon Observatory (OCO), and NPP Sounder -PEATE projects. -1 Introduction -Data volume and computational needs for Earth science -missions at NASA are growing by orders of magnitude. The -low cost of disk storage space and the increasing power -and pervasiveness of high performance computing have engendered -an era in which previously unimaginable science -questions can be answered in years rather than decades. -These science questions range from the study of sea surface -temperatures to observe maritime pollution, to measuring -atmospheric chemical composition for weather forecasting, -to obtaining a better understanding of the Earth’s global carbon -cycle and climate change as a whole. -A significant portion of any space-based NASA earth -science mission is a Ground Data System (GDS). The GDS -is responsible for receiving raw spacecraft data as delivered -from a ground station1, and processing the information -through several focused series of steps with the goal of -[Footnote 1: A strategically placed data center on Earth with ample ground-to-space bandwidth and connectivity for receiving satellite data.]
-delivering the scientific value encoded in the data to interested -scientists, both locally at an instrument team center, -and then to universities, decision makers, and the broader -science community. The processing that a GDS must perform -ranges from mundane activities including data (un- -)marshalling (removal of special space “header” information), -and subsetting, to more involved processes including -temporal and spatial positioning, calibration, and statistical -analysis, to complex scientific assimilation including -prospective and retrospective physical modeling of a scene. -Beginning with Automated Multi-Mission Operations -System (AMMOS) Multi-mission Ground Data System -(MGDS) in the early 1990s, our work has focused on building -reusable software components for GDS systems. As -an example, the Central Data Base (CDB) Subsystem of the -MGDS included data base management software comprised -of metadata and file management, file transfer capabilities, -user interfaces and data storage facilities to support multimission -telemetry data streams for current and future planetary -missions. This demanded that the CDB architecture -adhere to the architectural principles of extensibility, scalability, -and reusability. Because the CDB was and is part of -a larger system that included controlled, centralized hardware, -these architectural principles of CDB were satisfied -for AMMOS by simply ensuring that the CDB was data and -policy driven. -Our ensuing work on the Alaska SAR Facility (ASF) and -NASA Scatterometer (NSCAT) projects, made clear two -significant trends: 1) neither of these missions were part -of the controlled, centralized system for which the CDB -was developed and 2) the data management requirements -for these two missions were different from each other and -AMMOS. This meant that 1) hardware and platform choices -could not be assumed and 2) additional capabilities not originally -required for AMMOS had to be developed. In order -to meet mission schedule and cost constraints, developers -for each project independently employed a method we -coined “rapid adaptation” of the original CDB software that -resulted in two very successful mission data systems with -ultimately very few similarities or shared code. -At the time the NSCAT follow-on mission (SeaWinds on -ADEOS II) was ramping up, a technology task originally -funded by the NASA Office of Space Science was focused -on architecting and developing a common, standards-based -software framework dubbed Object Oriented Data Technology -(OODT) [12]. OODT provided “out of the box” core -data management software services while remaining adaptable -to address the (potentially evolving) requirements that -are unique from mission to mission. -Several authors of this paper supporting SeaWinds and -the OODT technology task decided to collaborate to create -a platform- and database-independent service for managing -files and tasks. The result of this collaboration was -the OODT Catalog and Archive Service component that -was architected to be reusable, reliable and scalable. The -SeaWinds (on QuikSCAT and ADEOS II) and Advanced -Communications Technology Satellite (ACTS) Mobile Terminal -(AMT) projects benefited greatly from employing -the CAS component to support their science data systems. -QuikSCAT is in its 10th year of a planned 2-year mission -and is continuing to function in a nearly lights out mode. 
-Hardware has been added to the system to support the unplanned -data and processing volumes (reprocessing of 7 -years of data completed in 6 months, simultaneous with -daily operations) by simply changing the software configuration. -No software engineers were required to extend the -system. -While the CAS component successfully supported Sea- -Winds and AMT, the following JPL earth missions, Orbiting -Carbon Observatory (OCO) and NPP Sounder PEATE, -needed to support far more complex processing (greatly increased -data volumes and processing throughput) and various -hardware and platform configurations. This forced us to -rethink the CAS component implementation which resulted -in 1) the refactoring of the CAS component into two distinct -components, the File Manager and the Workflow Manager -and 2) the development of a third component to provide a -standard interface to various hardware and platform configurations, -the Resource Manager. -The refactoring of the CAS into the File Manager and the -Workflow Manager components solved several issues. First, -it decoupled the initiation of a workflow from the ingestion -of a file. Therefore, while workflows can be initiated based -on the ingestion of a particular file or file type, they can also -be initiated based on other events such as a specific time of -day, an operator request or a software request. Second, the -refactoring provides developers and system designers the -ability to utilize only the components they need. And third, -the refactoring supports independent evolution of the components, -and thus capabilities. The combination of these -three refactored CAS components have come to be known -as the Process Control System, or PCS. -In addition to the File Manager, Workflow Manager -and Resource Manager components that provide common -reusable capabilities for file and metadata management, -pipeline processing and job submission, we have also developed -reusable interfaces to these components to provide -additional commonly required capabilities for science data -management systems. To support the automation of file ingestion, -we have developed a configurable push-pull framework -and crawler framework. To provide easy integration -of science code in order to support all phases of algorithm -development (testbed, operations and science computing facility), -the PCS Task Wrapper has been developed. -In this paper we will describe our core PCS components, -their architecture, how they helped us solve problems on -OCO and NPP Sounder PEATE, and how they are positioning -us for the future of Earth science mission work. We believe -such work will necessitate the same spirt of architectural -reuse, understanding and mission specific adaptation -that led to the genesis of the modern PCS and that will ultimately -lead to its future evolution. We will argue in this paper -that our PCS uniquely positions us in the state of the art -in constructing large-scale, distributed, data-intensive GDS -software for NASA Earth science missions. -The rest of this paper is organized as follows. Section 2 -provides further background and related efforts in the areas -of grid computing, workflow systems and science data systems. -Section 3 describes the core PCS architectural components -in greater detail. Section 4 presents our experience -leveraging the PCS on OCO and NPP Sounder PEATE. Section -5 rounds out the paper with conclusions and highlights -our planned future work. 
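The decoupling described above, where a workflow is started by a named event rather than directly by a file arriving, is the structural move that lets the same pipeline be triggered by an ingestion, a time of day, or an operator request. A minimal sketch of that idea follows; all class names, event names and file names here are invented for illustration and are not the actual Workflow Manager API.

import java.util.*;
import java.util.function.Consumer;

// Toy event bus: ingest crawlers, timers, or operators all raise named events,
// and workflows subscribe to event names instead of being hard-wired to file ingestion.
class WorkflowEventBus {
    private final Map<String, List<Consumer<Map<String, Object>>>> subscribers = new HashMap<>();

    void onEvent(String eventName, Consumer<Map<String, Object>> workflowStarter) {
        subscribers.computeIfAbsent(eventName, k -> new ArrayList<>()).add(workflowStarter);
    }

    void raise(String eventName, Map<String, Object> metadata) {
        for (Consumer<Map<String, Object>> starter : subscribers.getOrDefault(eventName, List.of())) {
            starter.accept(metadata);   // each subscriber kicks off one workflow instance
        }
    }
}

class Demo {
    public static void main(String[] args) {
        WorkflowEventBus bus = new WorkflowEventBus();
        bus.onEvent("L1B_FILE_INGESTED",
                md -> System.out.println("start calibration pipeline for " + md.get("Filename")));
        // The same workflow could equally be raised by a cron timer or an operator request.
        bus.raise("L1B_FILE_INGESTED", Map.of("Filename", "oco_l1b_20090101.hdf"));
    }
}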
-2 Background and Related Work -Since the development of the computational grid [8] as -a means for the virtualization and sharing of processing -and storage resources across organizational and geographic -boundaries, many groups and organizations have recognized -the power of the grid as an enabler of large-scale scientific -research. In this paper, we discuss ongoing software -projects and research initiatives relevant to the PCS. -2.1 Grid Systems -The Globus toolkit [9], developed by The Globus Alliance, -is a collection of open-source software tools for developing -distributed computing systems and applications. -The toolkit provides users with a suite of software components -and libraries that can either be used individually or -packaged together to implement the many aspects of a distributed, -service-oriented infrastructure including security, -resource and data discovery, access, and management, and -communication modules customized for a particular gridbased -effort. -2.2 Workflow Systems -The past ten years have witnessed an explosion in the -number of workflow languages and software systems developed -to support scientific workflows. Yu and Buyya [15] -attempted to taxonomize these scientific workflow systems, -largely according the underlying technologies with which -they were built. In addition to this taxonomy, Woollard, et. -al., presented a characterization of workflow systems based -the intended scientific use [14]. Specifically, the authors -classified certain workflow systems as Production Systems, -of which both the OCO and NPP Sounder PEATE ground -data systems are examples. -2.2.1 Condor -Condor [11] is a grid-based job scheduling system developed -at the University of Wisconsin Madison which aims, -among other things, to improve the effective usage of available -distributed computing and storage resources by detecting -and exploiting machine idle cycles. Condor provides -mechanisms for job queuing, setting scheduling policies, -and general resource management and monitoring. Condor -insulates users from the particulars of the details of the underlying -infrastructure by transparently handling decisions -about when and where jobs will be scheduled, monitoring -their execution, and producing notifications of completion. -While originally designed to operate in a workstation environment, -a variant of Condor, Condor-G [10], leverages the -Globus toolkit to provide a Condor implementation that is -interoperable with Globus-based grids. -2.2.2 Pegasus -Pegasus [7] is similar to Condor in that it provides a layer of -abstraction between the jobs to be processed and the hardware -that they will eventually be processed on. Developed -at the USC Information Science Pegasus is capable of dynamically -assigning computational workflows with multiple -processing steps to a large number of grid-based compute -nodes based on resource availability. In addition to generating -an initial workflow mapping, Pegasus offers the ability -to transparently remap a workflow, increasing the reliability -of the system in the event of failure in a small number of -compute nodes. -2.3 Science Data Processing Systems -Science Data Processing Systems provide the base level -of service needed to effectively manage the vast quantities -of intermediate and final data products generated by largescale, -computationally intensive research tasks. While there -are a large number of systems in operation, we focus our -discussion on those which provide services distinctly similar -to the PCS. 
-2.3.1 S4PA -The Simple, Scalable, Script-based Science Product -Archive (S4PA) [3], is a storage architecture developed and -deployed at NASAs Goddard Space Flight Center in support -of the operation of the Goddard Earth Science Data -and Information Services Center (GES DISC). As cost was -a primary factor in the development of S4PA, the developers -have taken pains to streamline the system. Hosting the -primary copy of all data online reduced the need for costly -physical media distribution, and utilizing the UNIX directory -structure, in combination with metadata-encoded filenames, -provides a simplified mechanism for archive and retrieval. -As its name implies, the S4PA is primarily a data archive -service. The PCS, as described in this paper, addresses data -archiving, but takes a more architecturally grounded approach, -eschewing scripts in favor of first-class architectural -components and connectors to implement complete, endto- -end data processing pipelines. Furthermore, as complete -science data processing pipelines are composed of a large -number of complimentary, interconnected services, a formal -architectural underpinning helps to provide unity and -cohesion among the constituent components. -2.4 Standards -Grid-based science data processing systems have matured -sufficiently for common themes, lessons, and challenges -to emerge among the many participants. As a result, -there are several ongoing efforts to codify the shared knowledge -and experience into formal standards. We discuss the -Open Grid Framework and the Open Archives Initiatives -Protocol for Metadata Harvesting. -2.4.1 OGF -The Open Grid Forum [2] is actively developing standards -and specifications with the goal of spreading the adoption -of grid-based software systems. The OGF is comprised -of business, government, scientific, and academic organizations -and focuses on interoperability as the key to expanding -the utilization of grids. Through both advocacy and policy, -the OGF represents an independent voice on the role of -grids, and their potential to aid modern research. -2.4.2 OAI -The Open Archives Initiative [1] also promotes standards -for interoperability and has developed, among others, the -Protocol for Metadata Harvesting (OMI-PMH). The goal -of the OMI-PMH is to improve application interoperability -by enabling consistency in the way metadata (data about -data) is exposed, accessed, and interpreted. By providing -a flexible, extensible standard interface to the rich array -of application-specific metadata currently stored in nonuniform, -distributed repositories, the OAI hopes to facilitate -the broader accessibility and usability of distributed data resources. -3 PCS Core Architecture -In this section, we describe the PCS core components. -The three PCS manager components, File Manager, Workflow -Manager, and Resource Manager, are daemon-like web -service components responsible for answering basic questions -regarding file locations, metadata, task control and -data flow, and resource availability, monitoring, and usage. -The three PCS frameworks together implement one of -two critical higher level services in data processing systems: -(1) managing the ingestion and acquisition of remotely acquired -datasets, handled via the Crawler Framework and -Push Pull components ; and (2) managing pipeline processing, -product ingestion and data production, handled via the -PCS Task Wrapper. We will describe each component in -greater detail below. 
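Because the three manager components expose their functionality over XML-RPC, any language with an XML-RPC client can talk to them. As a rough illustration only, the sketch below uses the Apache XML-RPC client library to ping a File Manager instance; the port (9000) and the handler/method name ("filemgr.isAlive") are assumptions made for the example and may not match a given deployment.

import java.net.URL;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;

public class FileManagerPing {
    public static void main(String[] args) throws Exception {
        // Hypothetical endpoint; a real deployment would configure its own host and port.
        XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
        config.setServerURL(new URL("http://localhost:9000/RPC2"));
        XmlRpcClient client = new XmlRpcClient();
        client.setConfig(config);
        // "filemgr.isAlive" is an assumed handler/method name, used purely for illustration.
        Object alive = client.execute("filemgr.isAlive", new Object[] {});
        System.out.println("File Manager reachable: " + alive);
    }
}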
The overall PCS architecture described -in this section is given in Fig. 1. -3.1 File Manager -The File Manager component is responsible for tracking, -ingesting and moving file data and metadata between a -client system and a server system. The File Manager is an -extensible software component that provides an XML-RPC -external interface, and a fully tailorable Java-based API for -file management. The critical objects managed by the File -Manager include: -Products - Collections of one or more files, and their associated -Metadata. -Metadata - A map of key to multiple values of descriptive -information about a Product. -References - Pointers to a Product file’s original location, -and to its final resting location within the archive constructed -by the File Manager. -Product Type - Descriptive information about a Product -that includes what type of file Uniform Resource Identifier -(URI) [5] generation scheme to use, the root -repository location for a particular Product, and a description -of the Product. -Element - A singular Metadata element, such as “Author”, -or “Creator”. Elements may have additional metadata, -in the form of the associated definition and even a corresponding -Dublin Core [4] attribute. -Versioner - A URI generation scheme for Product Types -that defines the location within the archive (built by -the File Manager) where a file belonging to a Product -(that belongs to the associated Product Type) should be -placed. -Each Product contains one or more References, and one -Metadata object. Each Product is a member of a single -Product Type. The Metadata collected for each Product is -defined by a mapping of Product Type to one or more Elements. -Each Product Type has an associated Versioner. -3.2 Workflow Manager -The Workflow Manager component is responsible for description, -execution, and monitoring of Workflows, using a -client, and a server system. Workflows are typically considered -to be sequences of tasks, joined together by control -flow, and data flow, that must execute in some ordered -fashion. Workflows typically generate output data, perform -routine management tasks (such as email, etc.), or describe -a business’s internal routine practices [14]. The Workflow -Manager is an extensible software component that provides -an XML-RPC external interface, and a fully tailorable Java-based -API for workflow management. The critical objects -managed by the Workflow Manager include: -Events - are what trigger Workflows to be executed. Events -are named, and contain dynamic Metadata information, -passed in by the user. -Metadata - a dynamic set of properties, and values, provided -to a WorkflowInstance via a user-triggered -Event. -Workflow - a description of both the control flow, and data -flow of a sequence of tasks (or stages) that must be executed -in some order. -Workflow Instance - an instance of a Workflow, typically -containing additional runtime descriptive information, -such as start time, end time, task wall clock time, etc. -A WorkflowInstance also contains a shared Metadata -context, passed in by the user who triggered the Workflow. -This context can be read/written to by the underlying -WorkflowTasks, present in a Workflow. -Workflow Tasks - descriptions of data flow, and an underlying -process, or stage, that is part of a Workflow. -Workflow Task Instances - the actual executing code, or -process, that performs the work in the Workflow Task. -Workflow Task Configuration - static configuration -properties, that configure a WorkflowTask.
-Workflow Conditions - any pre (or post) conditions on the -execution of a WorkflowTask. -Workflow Condition Instances - the actual executing -code, or process, that performs the work in the Workflow -Condition. -Each Event initiates one or more Workflow Instances, -providing a Metadata context (submitted by an external -user). Each Workflow Instance is a run-time execution -model of a Workflow. Each Workflow contains one or -more Workflow Tasks. Each Workflow Task contains a single -Workflow Task Configuration, and one or more Workflow -Conditions. Each Workflow Task has a corresponding -Workflow Task Instance (that it models), as does each -Workflow Condition have a corresponding Workflow Condition -Instance. -3.3 Resource Manager -The Resource Manager component is responsible for execution, -monitoring and tracking of jobs, storage and networking -resources for an underlying set of hardware resources. -The Resource Manager is an extensible software -component that provides an XML-RPC external interface, -and a fully tailorable Java-based API for resource management. -The critical objects managed by the Resource Manager -include: -Job - an abstract representation of an execution unit, that -stores information about an underlying program, or execution -that must be run on some hardware node, including -information about the Job Input that the Job -requires, information about the job load, and the queue -that the job should be submitted to. -Job Input - an abstract representation of the input that a Job -requires. -Job Spec - a complete specification of a Job, including its -Job Input, and the Job definition itself. -Job Instance - the physical code that performs the underlying -job execution. -Resource Node - an available execution node that a Job is -sent to by the Resource Manager. -Each Job Spec contains exactly one Job, and Job Input. -Each Job Input is provided to a single Job. Each Job describes -a single Job Instance. And finally, each Job is sent -to exactly one Resource Node. -3.4 Crawler Framework -The Crawler Framework was an effort to standardize the -common ingestion activities that occur both in data acquisition -and archival, as well as those that occur in pipeline -processing. These types of activities regularly involve identification -of files and directories to crawl (based on e.g., -mime type, regular expressions, or direct user input), satisfaction -of ingestion pre-conditions (e.g., the current crawled -file has not been previously ingested), followed by metadata -extraction. After metadata extraction, crawled data follows -a standard three state lifecycle: (1) preIngestion - where -e.g., a file may be unzipped or pre-processed prior to ingestion; -(2) postIngest success, indicating a successful ingestion -has occurred and e.g., the origin data file from the -ingest area should be deleted; and (3) postIngest failure, indicating -that ingestion was not successful and some corrective -action, e.g., moving the failed file to a failure area for -later examination, should occur. -To date, we have identified three types of Product -Crawlers, where each Crawler varies along the lines of customized -precondition verification, crawling strategy, and -need for metadata extraction. The StdProductCrawler assumes -that a Metadata object has already been generated -and included with a Product prior to ingestion, so no further -work is required to generate Metadata from a Product – -the Product is ready to be ingested.
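Before the remaining two crawler types, the three-state lifecycle above can be made concrete with a small sketch. This is not the Crawler Framework's real API; every class, enum and method name here is invented to illustrate how actions attached to preIngest, postIngestSuccess and postIngestFailure could be run around an ingestion attempt.

import java.io.File;
import java.util.*;

// Simplified stand-ins for the phases and actions described above (names are illustrative only).
enum CrawlPhase { PRE_INGEST, POST_INGEST_SUCCESS, POST_INGEST_FAILURE }

interface CrawlerAction {
    void perform(File productFile);
}

class SimpleProductCrawler {
    private final Map<CrawlPhase, List<CrawlerAction>> actions = new EnumMap<>(CrawlPhase.class);

    void addAction(CrawlPhase phase, CrawlerAction action) {
        actions.computeIfAbsent(phase, p -> new ArrayList<>()).add(action);
    }

    void crawl(File productFile) {
        run(CrawlPhase.PRE_INGEST, productFile);        // e.g. unzip or pre-process the file
        boolean ingested = ingest(productFile);         // extract metadata and hand off to a (hypothetical) file manager
        run(ingested ? CrawlPhase.POST_INGEST_SUCCESS
                     : CrawlPhase.POST_INGEST_FAILURE, productFile);
    }

    private void run(CrawlPhase phase, File productFile) {
        for (CrawlerAction a : actions.getOrDefault(phase, List.of())) {
            a.perform(productFile);
        }
    }

    private boolean ingest(File productFile) {
        // Placeholder for metadata extraction plus ingestion into the File Manager.
        return productFile.exists();
    }
}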
The MetExtractorProductCrawler -is responsible for generating a Metadata object -dynamically, as files are encountered during the crawling -process. Finally, the AutoDetectCrawler uses content -type identification and regular expressions to identify Product -Types dynamically, and then defaults to the behavior of -the MetExtractorProductCrawler for Product Types identified -via content detection. The critical objects managed by -the Crawler Framework are: -Crawler Action - is attached to one or more of the three -phases, and when a ProductCrawler enters a given -phase, all the CrawlerActions attached to that phase -are executed. The valid phases are: preIngest, -postIngestSuccess and postIngestFailure. -Precondition Comparator - is used by MetExtractorProductCrawler -and AutoDetectProductCrawler. They are -part of those ProductCrawlers' customized implementations -of precondition verification that identify appropriate -times to stifle or allow metadata extraction, and -ultimately ingestion, to occur. -Metadata Extractor - is run by the MetExtractorProductCrawler -and the AutoDetectProductCrawler to -generate Metadata for a Product file based on some -business rules and logic. -3.5 Push Pull Framework -The Crawler Framework supports many generic ingestion -services, including metadata extraction, crawling, and -ingestion; however, one service that necessitated further -work was the development of a protocol layer allowing -a ProductCrawler to obtain content using protocol plugins -that download content using implementations of remote -protocols such as HTTP, FTP, WinNT file system, HTTPS, -etc. -The Push Pull Framework is responsible for remote data -acquisition and acceptance over modern web protocols, -such as those mentioned above. The Push Pull Framework -is flexible in that it provides the ability to plug in different -Metadata Extractors, Data Protocols, Content Types, -etc. The framework supports parallel file transfers and data -downloads, email-based push data acceptance using IMAP, -SMTP protocols, and the ability to configure “Virtual” remote -directories (based on Metadata such as Date/Time) -from which files can be downloaded. -The critical objects managed by the Push Pull Framework -are: -Retrieval Method - defines the manner in which files are -retrieved from remote sites. It is given a configuration -file, the Parser for the file, and a FileRetrievalSystem -(which handles all the complexities of multi-threaded -file downloading). There are currently two out-of-the-box -RetrievalMethods: RemoteCrawler and ListRetriever. -RemoteCrawler is a configurable remote site -directory and file regular expression filterable crawler. -ListRetriever will download a given list of file URIs -[5]. -Parser - parses a given configuration file into a VirtualFileStructure which is used to filter URIs to download. -Protocol - handles file transfer and communication via -some transfer protocol. Currently implemented Protocols -include: sftp, ftp, http, imaps, file (localhost). -3.6 PCS Task Wrapper -The PCS Task Wrapper framework is responsible for -standardizing the setup, process initiation, execution and -file management tasks surrounding execution of NASA -Product Generation Executives, or PGEs. PGEs codify a -scientific algorithm, some step in the overall scientific process -involved in a mission science workflow. -The PCS Task Wrapper provides a stable operating environment -to the underlying PGE during its execution lifecycle.
-If the PGE requires a file, or metadata regarding the -file, the PCS Task Wrapper is responsible for delivering that -information to the PGE in a manner that meets its requirements. -If the PGE requires knowledge of upstream or downstream -PGEs in a sequence of executions, that information -is also made available, and finally if information regarding -disk space, node information such as CPU availability, etc., -is required, the PCS Task Wrapper provides this information -to the underlying PGE. After this information is collected, -the PGE is executed and its output Product file and -Metadata generation is managed via the PCS Task Wrapper -framework. The PCS Task Wrapper is responsible for -marshalling output Products and Metadata back to the File -Manager for use in downstream data processing and pedigree. -In support of this, the PCS Task Wrapper leverages -the Crawler Framework to ingest (during pipeline processing) -the output Product files and Metadata produced by the -PGE. -As can be gleaned from the above discussion, the PGE -Task Wrapper is really the unifying bridge between the execution -of a step in the overall processing pipeline, and the -available PCS component services and the information that -they collectively manage. -The critical objects managed by the PCS Task Wrapper -are: -PGETaskInstance - an abstract class which contains a -generalized set of actions usually performed when running -PGEs. Every variable and method is protected, -thus allowing subclasses to easily modify just those -generalized actions which need to be customized for -different PGEs. -Pge Config File Builder - builds a PgeConfig object and -sets additional Metadata which codifies the information -necessary for orchestrating a PGE through its lifecycle. -The PCS Task Wrapper is based on a simple but -powerful XML syntax which allows a scientist to simply -fill out an XML file to describe the necessary steps -to execute a PGE. -Config File Property Adder - builds the Pge Config file -object and sets custom PGE Metadata. This allows -for a general PgeConfigBuilder with different ConfigFilePropertyAdders for setting PGE-specific fields in -the PgeConfig object. -Science Pge Config File Writer - passes PGE run information -via configuration files. This object allows -for any number of config files in any desired format -to be generated describing PGE input and those files -to be delivered to the PGE. The PCS Task Wrapper -provides existing implementations, including a default -XML Stylesheet Language (XSL) Transformation -based SciPgeConfigFileWriter. (Figure 1. Component Interaction Within the PCS.) -Pcs Met File Writer - aids in generating Metadata objects -associated with PGE output products. -4 Experience and Evaluation -We have successfully applied the Process Control System -(PCS) to existing NASA missions: the Orbiting Carbon -Observatory (OCO) mission, and the NPP Sounder PEATE -mission. Both systems involve tasks such as high throughput -job processing, terabyte-scale data management, and -science computing facilities. -4.1 Orbiting Carbon Observatory Mission -On OCO, the mission is using the File Manager to ingest -MODIS, CloudSat and other ancillary data products -for use in the high performance Level 2 Science Algorithm.
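The description of PGETaskInstance as an abstract class whose protected members let subclasses adjust individual steps suggests a template-method style. The sketch below is illustrative only, not the real PCS Task Wrapper class (PgeTaskSketch, stageInputs, executePge and ingestOutputs are assumed names); it shows a generalized run lifecycle with a mission-specific subclass customizing just the execution step.

import java.util.LinkedHashMap;
import java.util.Map;

/** Illustrative only: a template-method style task wrapper, not the real PGETaskInstance. */
public abstract class PgeTaskSketch {

    // Protected state so mission-specific subclasses can adjust individual steps.
    protected final Map<String, String> metadata = new LinkedHashMap<>();

    /** Generalized lifecycle: stage inputs, run the PGE, then ingest its outputs. */
    public final void run() {
        stageInputs();
        int exitCode = executePge();
        metadata.put("ExitCode", String.valueOf(exitCode));
        if (exitCode == 0) {
            ingestOutputs();
        }
    }

    protected void stageInputs() {
        // Default: nothing to stage; subclasses may fetch upstream Products here.
    }

    protected abstract int executePge();

    protected void ingestOutputs() {
        // Default: hand produced files and Metadata back for ingestion (elided in this sketch).
        System.out.println("Ingesting outputs with metadata " + metadata);
    }
}

/** A hypothetical mission-specific subclass customizing only the execution step. */
class EchoPgeTask extends PgeTaskSketch {
    @Override
    protected int executePge() {
        metadata.put("PgeName", "echo");
        System.out.println("Pretending to run a PGE executable");
        return 0;
    }

    public static void main(String[] args) {
        new EchoPgeTask().run();
    }
}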
-To date, OCO has already used the PCS software to process -over four terabytes of Fourier Transform Spectrometer -(FTS) data provided by ground-based instruments located -around the country (e.g., Park Falls, Montana, and Darwin, -Australia), and has used the software to support Instrument -Thermal Vacuum (TVAC) testing, processing 100% of all -data taken by the OCO instrument during TVAC. Also, the -PCS supports a science computing facility in which variants -of scientific software can be exercised prior to inclusion in -an operations Pipeline. -4.2 NPP Sounder PEATE Mission -Specifically, NPP Sounder PEATE has already used the -File Manager and Workflow Manager to ingest and process -hundreds of gigabytes of IASI data (and is in preparation to -accept CRIMS data). Also on PEATE, the PCS is currently -being used to re-catalog over fifteen million existing science -data products from the NASA AIRS mission's TLSCF. -In addition, the Resource Manager will be used on NPP to -support job processing across an eighty-node cluster. -4.3 Further Applications -In addition to the two aforementioned NASA missions, -the PCS framework is being leveraged on reimbursable -work for the National Cancer Institute (NCI)’s Early Detection -Research Network (EDRN) [6]. JPL leads the informatics -efforts on EDRN, and the PCS framework is being -used in the collection, annotation and dissemination of raw -scientific data supporting the early detection of cancer to -scientists across the country. -In the next year, PCS will also be used to support a new -JPL-led NASA mission, the Soil Moisture Active Passive -(SMAP) mission. The science computing facility designs -on OCO and NPP have been used to create an algorithm -testbed for SMAP scientists early in the design phase of the -mission so that software integration risks can be mitigated -during mission development [13]. -5 Conclusions and Future Work -While the norm for earth science missions has been for -each mission to develop its own one-off science data system -from scratch, the continual decrease in mission funding -combined with the exponential increase in mission complexity -(data volume and processing throughput) over the -last decade has made this approach passé and risky. It was -clear that the need for a new approach was imminent. -To this end, we have developed a standards-based software -framework to provide common science data system -services that yields the benefits of reuse while remaining -adaptable to address the requirements that are unique to the -customer. This reusable software is centered around the -most basic science data system functions that support file -and metadata management, workflow management, and resource -management. Additional frameworks augment the -core capabilities to provide automation for remote data acquisition, -data ingestion and standard pipeline processing. -This reusable software framework is the Process Control -System (PCS) we have described in this paper. -While the PCS has successfully supported the Orbiting -Carbon Observatory (OCO) and NPP Sounder PEATE -missions, upcoming missions in NASA's Decadal Survey -present additional challenges. The JPL-led Soil Moisture -Active Passive (SMAP) Mission (currently in formulation -phase) will be using the PCS not only for operations, but -also for the algorithm testbed and the science computing facility.
-Providing the operational infrastructure to the algorithm -team early in the mission lifecycle will greatly reduce -the cost and risk of development-to-operations for the most -costly and risky aspect of most earth science data systems, -the algorithms. However, this also means that easy integration -of algorithms and dynamic workflow specification -are our current focus for extending the PCS capabilities. -Not far behind SMAP is another JPL-led mission, Deformation, -Ecosystem Structure and Dynamics of Ice (DESDynI) -Mission. The challenges of DESDynI are requiring -us to consider the deployment of PCS components to support -a grid architecture, supporting distributed file management -and processing capabilities supported by centralized -access to a virtual science data system. -Acknowledgements -This effort was supported by the Jet Propulsion Laboratory, -managed by the California Institute of Technology -under a contract with the National Aeronautics and Space -Administration. -References -[1] Open archives initiative, http://www.openarchives.org. -[2] Open grid forum, http://www.ogf.org. -[3] S4pa, http://daac.gsfc.nasa.gov/techlab/s4pa/index.shtml. -[4] Dublin core metadata element set, 1999. -[5] T. Berners-Lee, R. Fielding, and L. Masinter. Uniform resource -identifiers (uri): Generic syntax. Technical Report -RFC 2396, 1998. -[6] D. Crichton, S. Kelly, C. Mattmann, Q. Xiao, J. S. Hughes, -J. Oh, M. Thornquist, D. Johnsey, S. Srivastava, L. Essermann, -and W. Bigbee. A distributed information services -architecture to support biomarker discovery in early detection -of cancer. In e-Science, page 44, 2006. -[7] E. Deelman, J. Blythe, Y. Gil, C. Kesselman, G. Mehta, -S. Patil, M.-H. Su, K. Vahi, and M. Livny. Pegasus: Mapping -Scientific Workflows onto the Grid. 2004. -[8] I. Foster. The anatomy of the grid: Enabling scalable virtual -organizations. pages 6–7, 2001. -[9] I. Foster. Globus toolkit version 4: Software for serviceoriented -systems. pages 2–13. 2005. -[10] J. Frey, T. Tannenbaum, M. Livny, I. Foster, and S. Tuecke. -Condor-g: A computation management agent for multiinstitutional -grids. Cluster Computing, 5(3):237–246, July -2002. -[11] M. J. Litzkow, M. Livny, and M.W. Mutka. Condor-a hunter -of idle workstations. pages 104–111, 1988. -[12] C. Mattmann, D. J. Crichton, N. Medvidovic, and -S. Hughes. A software architecture-based framework for -highly distributed and data intensive scientific applications. -In ICSE, pages 721–730, 2006. -[13] D. Woollard, O. ig Kwoun, T. Bicknell, S. Dunbar, and -K. Leung. A science data system approach for the smap -mission. In IEEE Radar, 2009. -[14] D. Woollard, N. Medvidovic, Y. Gil, and C. A. Mattmann. -Scientific software as workflows: From discovery to distribution. -Software, IEEE, 25(4):37–43, 2008. -[15] J. Yu and R. Buyya. A taxonomy of workflow management -systems for grid computing, Apr 2005. -View publication stats \ No newline at end of file +page&Paul Ramirez&依赖 +author&publication&AGGREGATION +page&22 may 2014&依赖 +enhancement&downloaded file&AGGREGATION +user&downloaded file&依赖 +user&enhancement&依赖 +Charles E. 
Miller Jet Propulsion Laboratory California Institute&Technology Pasadena&AGGREGATION +We&science processing pipeline&实现 +We&reusable architecture and implementation framework&依赖 +We&mission ground data system&实现 +system&software component&依赖 +Our&system& +system&software component&依赖 +archive ( cas )&QuikSCAT , SeaWinds and AMT earth science mission&依赖 +software component&component&GENERALIZATION +paper&orbiting carbon observatory (&依赖 +paper&pc&依赖 +paper&two current earth science mission&依赖 +context&two current earth science mission&AGGREGATION +paper&context&依赖 +order&magnitude&AGGREGATION +1 Introduction Data volume&order&依赖 +1 Introduction Data volume&magnitude&依赖 +1 Introduction Data volume&magnitude&依赖 +1 Introduction Data volume&magnitude&依赖 +1 Introduction Data volume&order&依赖 +1 Introduction Data volume&order&依赖 +increase power and pervasiveness&high performance computing&AGGREGATION +low cost&era&依赖 +low cost&disk storage space&AGGREGATION +low cost&era&依赖 +previously unimaginable science question&era&依赖 +previously unimaginable science question&year&依赖 +science question&sea surface temperature&依赖 +Earth&cycle& +better understanding&’s global carbon cycle and climate change&AGGREGATION +study&sea surface temperature&AGGREGATION +science question&study&依赖 +significant portion&space-based NASA earth science mission&AGGREGATION +goal&1A strategically&AGGREGATION +several focused series&step&AGGREGATION +processing&mundane activity&依赖 +processing&datum&依赖 +prospective and retrospective physical modeling&scene&AGGREGATION +processing&marshalling&依赖 +processing&datum&依赖 +processing&mundane activity&依赖 +processing&marshalling&依赖 +( removal&special space “ header ” information )&AGGREGATION +our&work& +central data base ( cdb ) subsystem&datum base management software&依赖 +datum base management software¤t and future planetary mission&依赖 +central data base ( cdb ) subsystem&mgd&AGGREGATION +central data base ( cdb ) subsystem&datum base management software&依赖 +datum base management software¤t and future planetary mission&依赖 +central data base ( cdb ) subsystem&datum base management software&依赖 +central data base ( cdb ) subsystem&datum base management software&依赖 +architectural principle&extensibility&AGGREGATION +architectural principle&ammo&依赖 +part&larger system&AGGREGATION +architectural principle&CDB&AGGREGATION +CDB&controlled , centralized system&依赖 +part&controlled , centralized system&AGGREGATION +work&clear two significant trend&依赖 +Our&work& +work&clear two significant trend&依赖 +developer&method&依赖 +developer&method&依赖 +we&“ rapid adaptation ”&依赖 +we&original CDB software&依赖 +“ rapid adaptation ”&original CDB software&AGGREGATION +technology task&task&GENERALIZATION +technology task&time&依赖 +NASA Office&Space Science&AGGREGATION +“ out&box ” core data management software service&AGGREGATION +Several author&this paper support seawind&AGGREGATION +result&collaboration&AGGREGATION +CAS component&component&GENERALIZATION +their&systems& +QuikSCAT&planned 2-year mission&依赖 +its&year& +10th year&planned 2-year mission&AGGREGATION +reprocessing and simultaneous&7 year&AGGREGATION +7 year&datum&AGGREGATION +JPL earth mission&increased data volume&依赖 +CAS component&Sea&依赖 +JPL earth mission&increased data volume&依赖 +JPL earth mission&far more complex processing (&依赖 +CAS component&Winds and AMT&依赖 +JPL earth mission&far more complex processing (&依赖 +development&a third component&AGGREGATION +refactoring&CAS component&AGGREGATION +refactoring&several issue&依赖 +refactoring&several 
issue&依赖 +refactoring&several issue&依赖 +refactoring&several issue&依赖 +refactoring&CAS&AGGREGATION +it&ingestion&依赖 +ingestion&file&AGGREGATION +initiation&workflow&AGGREGATION +it&initiation&依赖 +it&file&依赖 +it&workflow&依赖 +specific time&day&AGGREGATION +ingestion&particular file or file type&AGGREGATION +independent evolution&component&AGGREGATION +refactoring&component&依赖 +refactoring&independent evolution&依赖 +combination&three refactored CAS component&AGGREGATION +we&File Manager , Workflow Manager and Resource Manager component&依赖 +we&component&依赖 +we&reusable interface&依赖 +we&reusable interface&依赖 +we&configurable push-pull framework and crawler framework&依赖 +phase&algorithm development&AGGREGATION +easy integration&science code&AGGREGATION +they&future&依赖 +they&Earth science mission work&依赖 +we&core PCS component&依赖 +their&architecture& +future&Earth science mission work&AGGREGATION +we&architecture&依赖 +we&architecture&依赖 +they&us&依赖 +they&OCO&依赖 +our&components& +they&problem&依赖 +we&core PCS component&依赖 +its&evolution& +We&same spirt&依赖 +We&architectural reuse , understanding and mission specific adaptation&依赖 +genesis&modern pc&AGGREGATION +same spirt&architectural reuse , understanding and mission specific adaptation&AGGREGATION +our&uniquely& +state&art&AGGREGATION +We&us&依赖 +rest&paper&AGGREGATION +rest&follow&依赖 +area&grid computing , workflow systems and science data system&AGGREGATION +Section 3&greater detail&依赖 +Section 3&core PCS architectural component&依赖 +Section 4&our experience&依赖 +Section 4&leveraging&依赖 +our&experience& +section 5 round&paper&依赖 +section 5 round&paper&依赖 +development&virtualization and sharing&依赖 +development&organizational and geographic boundary&依赖 +virtualization and sharing&processing and storage resource&AGGREGATION +development&8 ]&依赖 +development&8 ]&依赖 +many groups and organization&power&依赖 +development&virtualization and sharing&依赖 +development&virtualization and sharing&依赖 +development&8 ]&依赖 +development&organizational and geographic boundary&依赖 +many groups and organization&grid&依赖 +development&8 ]&依赖 +development&8 ]&依赖 +development&8 ]&依赖 +development&processing and storage resource&依赖 +development&processing and storage resource&依赖 +development&processing and storage resource&依赖 +development&8 ]&依赖 +development&virtualization and sharing&依赖 +development&organizational and geographic boundary&依赖 +power&grid&AGGREGATION +development&8 ]&依赖 +many groups and organization&power&依赖 +development&processing and storage resource&依赖 +enabler&large-scale scientific research&AGGREGATION +many groups and organization&power&依赖 +development&processing and storage resource&依赖 +development&organizational and geographic boundary&依赖 +development&processing and storage resource&依赖 +development&processing and storage resource&依赖 +development&virtualization and sharing&依赖 +development&8 ]&依赖 +development&8 ]&依赖 +development&8 ]&依赖 +development&8 ]&依赖 +development&organizational and geographic boundary&依赖 +development&organizational and geographic boundary&依赖 +development&virtualization and sharing&依赖 +development&8 ]&依赖 +many groups and organization&grid&依赖 +development&organizational and geographic boundary&依赖 +development&8 ]&依赖 +development&computational grid [&AGGREGATION +development&virtualization and sharing&依赖 +many groups and organization&grid&依赖 +we&ongoing software project&依赖 +we&paper&依赖 +collection&open-source software tool&AGGREGATION +2.1 Grid Systems The Globus toolkit [ 9 ]&open-source software tool&依赖 +2.1 Grid Systems The Globus toolkit [ 
9 ]&open-source software tool&依赖 +many aspect&a distribute , service-oriented infrastructure include security , resource and datum discovery&AGGREGATION +communication module&particular gridbased effort&依赖 +suite&software components and library&AGGREGATION +2.2 Workflow Systems&number&依赖 +2.2 Workflow Systems&number&依赖 +2.2 Workflow Systems&explosion&依赖 +2.2 Workflow Systems&workflow languages and software system&依赖 +2.2 Workflow Systems&workflow languages and software system&依赖 +2.2 Workflow Systems&explosion&依赖 +number&workflow languages and software system&AGGREGATION +Yu and Buyya [ 15 ]&scientific workflow system&依赖 +they&technology&依赖 +Woollard and et . al.&workflow system&依赖 +Woollard and et . al.&characterization&依赖 +Woollard and et . al.&characterization&依赖 +Woollard and et . al.&workflow system&依赖 +Woollard and et . al.&characterization&依赖 +Woollard and et . al.&workflow system&依赖 +Woollard and et . al.&workflow system&依赖 +Woollard and et . al.&workflow system&依赖 +Woollard and et . al.&workflow system&依赖 +Woollard and et . al.&characterization&依赖 +Woollard and et . al.&characterization&依赖 +Woollard and et . al.&characterization&依赖 +Woollard and et . al.&workflow system&依赖 +characterization&workflow system&AGGREGATION +Woollard and et . al.&workflow system&依赖 +Woollard and et . al.&characterization&依赖 +Woollard and et . al.&characterization&依赖 +author&Production Systems&依赖 +OCO and NPP Sounder PEATE ground data system&which&依赖 +author&certain workflow system&依赖 +example&which&AGGREGATION +effective usage&available distributed computing and storage resource&AGGREGATION +University&Wisconsin Madison&AGGREGATION +Condor&user&依赖 +their&execution& +Condor&user&依赖 +Condor&infrastructure&依赖 +notification&completion&AGGREGATION +Condor&detail&依赖 +Condor&transparently&依赖 +detail&infrastructure&AGGREGATION +particulars&detail&AGGREGATION +variant&Condor and Condor-G [ 10 ]&AGGREGATION +variant&Globus toolkit&依赖 +variant&Globus toolkit&依赖 +variant&Globus toolkit&依赖 +layer&abstraction&AGGREGATION +large number&grid-based compute node&AGGREGATION +Pegasus&workflow&依赖 +Pegasus&workflow&依赖 +event&failure&AGGREGATION +small number&compute node&AGGREGATION +Pegasus&ability&依赖 +Pegasus&ability&依赖 +Pegasus&ability&依赖 +Pegasus&workflow&依赖 +reliability&system&AGGREGATION +vast quantity&intermediate and final data product&AGGREGATION +base level&service&AGGREGATION +large number&system&AGGREGATION +our&discussion& +support&operation&AGGREGATION +operation&Goddard Earth Science Data&AGGREGATION +developer&pain&依赖 +cost&development&依赖 +cost&S4PA&依赖 +development&S4PA&AGGREGATION +primary copy&all datum online&AGGREGATION +its&name& +favor&first-class architectural component and connector&AGGREGATION +large number&complimentary , interconnected service&AGGREGATION +complete science data processing pipeline&complimentary , interconnected service&依赖 +complete science data processing pipeline&large number&依赖 +We&Open Grid Framework&依赖 +2.4.1 OGF The Open Grid Forum [ 2 ]&standards and specification&依赖 +2.4.1 OGF The Open Grid Forum [ 2 ]&grid-based software system&依赖 +2.4.1 OGF The Open Grid Forum [ 2 ]&standards and specification&依赖 +2.4.1 OGF The Open Grid Forum [ 2 ]&adoption&依赖 +adoption&grid-based software system&AGGREGATION +OGF&business , government , scientific , and academic organization&依赖 +utilization&grid&AGGREGATION +OGF&role&依赖 +OGF&grid&依赖 +role&grid&AGGREGATION +OGF&independent voice&依赖 +OGF&grid&依赖 +their&potential& +OGF&independent voice&依赖 +OGF&role&依赖 +2.4.2 OAI The Open Archives 
Initiative [ 1 ]&interoperability&依赖 +2.4.2 OAI The Open Archives Initiative [ 1 ]&standard&依赖 +goal&application interoperability&依赖 +goal&application interoperability&依赖 +application interoperability&interoperability&GENERALIZATION +goal&OMI-PMH&AGGREGATION +OAI&broader accessibility and usability&依赖 +broader accessibility and usability&distributed data resource&AGGREGATION +rich array&application-specific metada&AGGREGATION +OAI&distributed data resource&依赖 +we§ion&依赖 +we&PCS core component&依赖 +we&3 PCS Core Architecture&依赖 +2 ) managing pipeline processing&PCS Task Wrapper&依赖 +2 ) managing pipeline processing&PCS Task Wrapper&依赖 +three PCS framework&two critical higher level service&实现 +one&two critical higher level service&AGGREGATION +three PCS framework&one&实现 +three PCS framework&two critical higher level service ###&实现 +We&component&依赖 +We&below&依赖 +We&greater detail&依赖 +overall PCS architecture&Fig. 1&依赖 +critical object&Products&依赖 +critical object&collection&依赖 +collection&one or more file&AGGREGATION +critical object&one or more file&依赖 +critical object&one or more file&依赖 +critical object&Products&依赖 +critical object&collection&依赖 +their&Metadata& +A map&key&AGGREGATION +multiple value&descriptive information&AGGREGATION +its&location& +file&location& +File Manager&use&依赖 +File Manager&what type&依赖 +what type&file uniform resource identifier ( uri ) [ 5 ] generation scheme&AGGREGATION +File Manager&file uniform resource identifier ( uri ) [ 5 ] generation scheme&依赖 +description&Product&AGGREGATION +File Manager&use&依赖 +File Manager&file uniform resource identifier ( uri ) [ 5 ] generation scheme&依赖 +element&form&依赖 +element&associated definition&依赖 +form&associated definition&AGGREGATION +element&additional metada&依赖 +A URI generation scheme&location&依赖 +A URI generation scheme&built&依赖 +A URI generation scheme&archive (&依赖 +Product&one or more reference&依赖 +member&single Product Type&AGGREGATION +Product&single Product Type&依赖 +Metadata&mapping&依赖 +mapping&Product Type&AGGREGATION +Metadata&one or more element&依赖 +Product Type&associated Versioner&依赖 +description , execution , and monitoring&Workflows&AGGREGATION +3.2 Workflow Manager TheWorkflow Manager component&Workflows&依赖 +3.2 Workflow Manager TheWorkflow Manager component&description , execution , and monitoring&依赖 +sequence&task&AGGREGATION +Workflows&output datum&依赖 +business&practices& +event&what triggerworkflow&依赖 +dynamic set&property&AGGREGATION +description&control flow&AGGREGATION +data flow&sequence&AGGREGATION +instance&Workflow&AGGREGATION +WorkflowInstance&shared Metadata context&依赖 +who&theWorkflow&依赖 +description&data flow&AGGREGATION +part&Workflow&AGGREGATION +static configuration property&WorkflowTask&依赖 +execution&WorkflowTask&AGGREGATION +Event&one or more Workflow Instances&依赖 +Workflow Instance&Workflow&依赖 +run-time execution model&Workflow&AGGREGATION +Workflow&one or more Workflow Tasks&依赖 +Workflow Task&single Workflow Task Configuration&依赖 +Workflow Task&model&依赖 +Workflow Task&a corresponding workflow task instance (&依赖 +Workflow Condition&corresponding Workflow Condition Instance&依赖 +3.3 Resource Manager The Resource Manager component&hardware resource&依赖 +set&hardware resource&AGGREGATION +3.3 Resource Manager The Resource Manager component&jobs , storage and networking resource&依赖 +3.3 Resource Manager The Resource Manager component&excecution , monitoring and traacking&依赖 +3.3 Resource Manager The Resource Manager component&set&依赖 +excecution , monitoring and traacking&jobs , storage and 
networking resource&AGGREGATION +abstract representation&execution unit&AGGREGATION +Job&that&依赖 +abstrct representation&input&AGGREGATION +its&Input& +complete specification&Job&AGGREGATION +physical code&job execution&依赖 +Job Spec&exactly one job&依赖 +Job&single Job Instance&依赖 +Job&one Resource Node&依赖 +satisfaction&ingestion pre-condition&AGGREGATION +type&files and directory&依赖 +type&files and directory&依赖 +type&activity&AGGREGATION +type&identification&依赖 +type&identification&依赖 +identification&files and directory&AGGREGATION +crawled datum&standard three state lifecycle&依赖 +crawled datum&standard three state lifecycle&依赖 +Crawler&customized precondition verification&依赖 +we&Product Crawlers&依赖 +Crawler&line&依赖 +three type&Product Crawlers&AGGREGATION +Crawler&crawilng strategy&依赖 +we&three type&依赖 +line&customized precondition verification&AGGREGATION +we&Product Crawlers&依赖 +we&three type&依赖 +file&process&依赖 +behavior&MetExtractorProductCrawler&AGGREGATION +AutoDetectCrawler&content type identification&依赖 +critical object&three phase&依赖 +ProductCrawler&given phase&依赖 +one or more&three phase&AGGREGATION +critical object&one or more&依赖 +Precondition Comparator -&MetExtractorProductCrawler and AutoDetectProductCrawler&依赖 +part&ProductCrawlers customized implementation&AGGREGATION +They&ProductCrawlers customized implementation&依赖 +They&precondition verification&依赖 +ProductCrawlers customized implementation&appropriate time&依赖 +ProductCrawlers customized implementation&precondition verification&AGGREGATION +protocol plugin&HTTP&实现 +implementation&remote protocol&AGGREGATION +one service&protocol layer&依赖 +one service&ProductCrawler&依赖 +3.5 push pull framework crawler framework&3.5 push pull framework crawler framework&依赖 +one service&further work&依赖 +protocol plugin&implementation&依赖 +development&protocol layer&AGGREGATION +protocol plugin&remote protocol&实现 +Push Pull Framework&modern web protocol&依赖 +Push Pull Framework&remote datum acquisition and acceptance&依赖 +Push Pull Framework&modern web protocol such ###&依赖 +email-based push datum acceptance&IMAP&依赖 +file&which&依赖 +Retrieval Method&manner&依赖 +file&manner&依赖 +file&remote site&依赖 +complexity&multi-threaded file download )&AGGREGATION +a fileretrievalsystem (&multi-threaded file download )&依赖 +configuration file&file&GENERALIZATION +a fileretrievalsystem (&complexity&依赖 +ListRetriever&given list&依赖 +ListRetriever&file URIs [ 5 ]&依赖 +given list&file URIs [ 5 ]&AGGREGATION +Virtual&filter uri&依赖 +Protocol -&transfer protocol&依赖 +Protocol -&file transfer and communication&依赖 +execution&NASA Product Generation executive&AGGREGATION +pge&step&依赖 +pge&overall scientific process&依赖 +pge&scientific algorithm&依赖 +its&lifecycle& +PGE&file , or metada&依赖 +manner&requirement&依赖 +its&requirements& +PGE&file&依赖 +sequence&execution&AGGREGATION +PGE&upstream or downstream pge&依赖 +PGE&knowledge&依赖 +knowledge&upstream or downstream pge&AGGREGATION +PGE&knowledge&依赖 +PGE&upstream or downstream pge&依赖 +PGE&upstream or downstream pge&依赖 +PGE&knowledge&依赖 +its&file& +output Product file and Metadata generation&PCS Task Wrapper framework&依赖 +PCS Task Wrapper&output Product file&依赖 +PCS Task Wrapper&Crawler Framework&依赖 +execution&step&AGGREGATION +generalized set&action&AGGREGATION +abstract class&generalized set&依赖 +abstract class&action&依赖 +Pge Config File Builder&PgeConfig object&依赖 +additional Metadata&information&依赖 +lifecycle&xml file&实现 +PCS Task Wrapper&simple but powerful XML syntax&依赖 +xml file&file&GENERALIZATION +lifecycle&scientist&实现 
+Config File Property Adder&Pge Config file object&依赖 +Science Pge Config File Writer&PGE run information&依赖 +Science Pge Config File Writer&configuration file&依赖 +Component Interaction&( xsl ) transformation&依赖 +Component Interaction&SciPgeConfigFileWriter&依赖 +We&NASA mission&依赖 +We&control system ( pc&依赖 +system&high throughput job processing&依赖 +mission&File Manager&依赖 +mission&4.1 Orbiting Carbon Observatory Mission&依赖 +OCO&four terabyte&依赖 +four terabyte&) datum ( ft&AGGREGATION +100 %&datum&AGGREGATION +OCO&PCS software&依赖 +OCO&) datum&依赖 +variant&scientific software&AGGREGATION +variant&science computing facility&依赖 +variant&science computing facility&依赖 +pc&science computing facility&依赖 +gigabyte&iasi datum (&AGGREGATION +hundred&gigabyte&AGGREGATION +4.2 NPP Sounder PEATE Mission Specifically NPP Sounder PEATE&File Manager and Workflow Manager&依赖 +PCS framework&addition&依赖 +PCS framework&4.3 Further application&依赖 +PCS framework&two aforementioned NASA mission&依赖 +PCS framework&’s early detection research network ( edrn ) [ 6 ]&依赖 +work&Network& +JPL&EDRN&依赖 +PCS framework&collection , annotation and dissemination&依赖 +PCS framework&raw scientific datum&依赖 +PCS framework&early detection&依赖 +collection , annotation and dissemination&raw scientific datum&AGGREGATION +early detection&cancer&AGGREGATION +JPL&informatics effort&依赖 +PCS framework&cancer&依赖 +NASA mission&mission&GENERALIZATION +software integration risk&mission development [ 13 ]&依赖 +design phase&mission&AGGREGATION +norm&scratch&依赖 +their&system& +norm&scratch&依赖 +common science data system service&benefit&依赖 +we&standards-based software framework&依赖 +we&end&依赖 +common science data system service&reuse&依赖 +adaptable&requirement&依赖 +benefit&reuse&AGGREGATION +reusable software&most basic science data system&依赖 +most basic science data system&metada management&依赖 +Additional framework&core capability&依赖 +we&paper&依赖 +mission&nasas decadal survey present additional challenge&依赖 +mission&nasas decadal survey present additional challenge&依赖 +mission&orbiting carbon observatory ( oco ) and npp sounder peate mission&依赖 +mission&nasas decadal survey present additional challenge&依赖 +pc&orbiting carbon observatory ( oco ) and npp sounder peate mission&依赖 +mission&orbiting carbon observatory ( oco ) and npp sounder peate mission&依赖 +mission&orbiting carbon observatory ( oco ) and npp sounder peate mission&依赖 +jpl-led soil moisture active passive ( smap ) mission (&operation&依赖 +jpl-led soil moisture active passive ( smap ) mission (&pc&依赖 +jpl-led soil moisture active passive ( smap ) mission (&operation&依赖 +jpl-led soil moisture active passive ( smap ) mission (&pc&依赖 +most earth science datum system&algorithm team&依赖 +most earth science datum system&algorithm team&依赖 +most earth science datum system&operational infrastructure&依赖 +most costly and risky aspect&most earth science datum system&AGGREGATION +most earth science datum system&operational infrastructure&依赖 +cost and risk&development-to-operation&AGGREGATION +our&focus& +easy integration&algorithm&AGGREGATION +JPL-led mission&SMAP&依赖 +Dynamics&ice ( desdynus ) mission&AGGREGATION +challenge&desdynus&AGGREGATION +challenge&deployment&依赖 +challenge&deployment&依赖 +challenge&us&依赖 +deployment&PCS component&AGGREGATION +challenge&PCS component&依赖 +challenge&us&依赖 +challenge&PCS component&依赖 +Acknowledgements This effort&Jet Propulsion Laboratory&依赖 +California Institute&Technology&AGGREGATION +reference&1 ] open archive initiative , http://www.openarchives.org&依赖 +technical 
report rfc 2396 , 1998&technical report rfc 2396 , 1998&依赖 +a distribute information service architecture&early detection&依赖 +a distribute information service architecture&cancer&依赖 +a distribute information service architecture&a distribute information service architecture&依赖 +a distribute information service architecture&cancer&依赖 +a distribute information service architecture&biomarker discovery&依赖 +a distribute information service architecture&early detection&依赖 +a distribute information service architecture&cancer&依赖 +a distribute information service architecture&a distribute information service architecture&依赖 +a distribute information service architecture&a distribute information service architecture&依赖 +a distribute information service architecture&cancer&依赖 +a distribute information service architecture&a distribute information service architecture&依赖 +a distribute information service architecture&biomarker discovery&依赖 +a distribute information service architecture&early detection&依赖 +a distribute information service architecture&early detection&依赖 +a distribute information service architecture&biomarker discovery&依赖 +a distribute information service architecture&biomarker discovery&依赖 +anatomy&grid&AGGREGATION +Condor-a hunter&idle workstation&AGGREGATION +a software architecture-based framework&a software architecture-based framework&依赖 +a software architecture-based framework&a software architecture-based framework&依赖 +a software architecture-based framework&highly distribute and datum intensive scientific application&依赖 +a software architecture-based framework&a software architecture-based framework&依赖 +a software architecture-based framework&highly distribute and datum intensive scientific application&依赖 +a software architecture-based framework&highly distribute and datum intensive scientific application&依赖 +a software architecture-based framework&highly distribute and datum intensive scientific application&依赖 +a software architecture-based framework&a software architecture-based framework&依赖 +page&ICSE&依赖 +a science datum system approach&smap mission&依赖 +smap mission&mission&GENERALIZATION +a science datum system approach&smap mission&依赖 +taxonomy&apr 2005&依赖 +taxonomy&apr 2005&依赖 +taxonomy&workflow management system&AGGREGATION +taxonomy&apr 2005&依赖 +taxonomy&apr 2005&依赖 diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions-simEnts.txt deleted file mode 100644 index de3ecb351d1cf7a834e774d043c5e3cc5845f0eb..0000000000000000000000000000000000000000 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions-simEnts.txt +++ /dev/null @@ -1,716 +0,0 @@ -See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/232619682 -A Reusable Process Control System Framework for the Orbiting Carbon -Observatory and NPP Sounder PEATE missions -Article · July 2009 -DOI: 10.1109/SMC-IT.2009.27 -CITATIONS -21 -READS -90 -11 authors, including: -Some of the authors of this publication are also working on these related projects: -Airborne Snow Observatory View project -The Planetary Data System PDS4 Information Model-Driven 
Architecture View project -Daniel J. Crichton -NASA -138 PUBLICATIONS 791 CITATIONS -SEE PROFILE -Sean Hardman -NASA -20 PUBLICATIONS 53 CITATIONS -SEE PROFILE -Paul Ramirez -NASA -39 PUBLICATIONS 408 CITATIONS -SEE PROFILE -Sean Colin-Patrick Kelly -NASA -43 PUBLICATIONS 213 CITATIONS -SEE PROFILE -All content following this page was uploaded by Paul Ramirez on 22 May 2014. -The user has requested enhancement of the downloaded file. -A Reusable Process Control System Framework for the Orbiting Carbon -Observatory and NPP Sounder PEATE missions -Chris A. Mattmann, Dana Freeborn, Dan Crichton, Brian Foster, -Andrew Hart, David Woollard, Sean Hardman, Paul Ramirez, -Sean Kelly, Albert Y. Chang, Charles E. Miller -Jet Propulsion Laboratory -California Institute of Technology -Pasadena, CA 91109, USA -mattmann@jpl.nasa.gov -Abstract -We describe a reusable architecture and implementation -framework for managing science processing pipelines for -mission ground data systems. Our system, dubbed “PCS”, -for Process Control System, improves upon an existing software -component, the OODT Catalog and Archive (CAS), -which has already supported the QuikSCAT, SeaWinds and -AMT earth science missions. This paper focuses on PCS -within the context of two current earth science missions: the -Orbiting Carbon Observatory (OCO), and NPP Sounder -PEATE projects. -1 Introduction -Data volume and computational needs for Earth science -missions at NASA are growing by orders of magnitude. The -low cost of disk storage space and the increasing power -and pervasiveness of high performance computing have engendered -an era in which previously unimaginable science -questions can be answered in years rather than decades. -These science questions range from the study of sea surface -temperatures to observe maritime pollution, to measuring -atmospheric chemical composition for weather forecasting, -to obtaining a better understanding of the Earth’s global carbon -cycle and climate change as a whole. -A significant portion of any space-based NASA earth -science mission is a Ground Data System (GDS). The GDS -is responsible for receiving raw spacecraft data as delivered -from a ground station1, and processing the information -through several focused series of steps with the goal of -1A strategically placed data center on Earth with ample ground-tospace -bandwidth and connectivity for receiving satellite data. -delivering the scientific value encoded in the data to interested -scientists, both locally at an instrument team center, -and then to universities, decision makers, and the broader -science community. The processing that a GDS must perform -ranges from mundane activities including data (un- -)marshalling (removal of special space “header” information), -and subsetting, to more involved processes including -temporal and spatial positioning, calibration, and statistical -analysis, to complex scientific assimilation including -prospective and retrospective physical modeling of a scene. -Beginning with Automated Multi-Mission Operations -System (AMMOS) Multi-mission Ground Data System -(MGDS) in the early 1990s, our work has focused on building -reusable software components for GDS systems. As -an example, the Central Data Base (CDB) Subsystem of the -MGDS included data base management software comprised -of metadata and file management, file transfer capabilities, -user interfaces and data storage facilities to support multimission -telemetry data streams for current and future planetary -missions. 
This demanded that the CDB architecture -adhere to the architectural principles of extensibility, scalability, -and reusability. Because the CDB was and is part of -a larger system that included controlled, centralized hardware, -these architectural principles of CDB were satisfied -for AMMOS by simply ensuring that the CDB was data and -policy driven. -Our ensuing work on the Alaska SAR Facility (ASF) and -NASA Scatterometer (NSCAT) projects, made clear two -significant trends: 1) neither of these missions were part -of the controlled, centralized system for which the CDB -was developed and 2) the data management requirements -for these two missions were different from each other and -AMMOS. This meant that 1) hardware and platform choices -could not be assumed and 2) additional capabilities not originally -required for AMMOS had to be developed. In order -to meet mission schedule and cost constraints, developers -for each project independently employed a method we -coined “rapid adaptation” of the original CDB software that -resulted in two very successful mission data systems with -ultimately very few similarities or shared code. -At the time the NSCAT follow-on mission (SeaWinds on -ADEOS II) was ramping up, a technology task originally -funded by the NASA Office of Space Science was focused -on architecting and developing a common, standards-based -software framework dubbed Object Oriented Data Technology -(OODT) [12]. OODT provided “out of the box” core -data management software services while remaining adaptable -to address the (potentially evolving) requirements that -are unique from mission to mission. -Several authors of this paper supporting SeaWinds and -the OODT technology task decided to collaborate to create -a platform- and database-independent service for managing -files and tasks. The result of this collaboration was -the OODT Catalog and Archive Service component that -was architected to be reusable, reliable and scalable. The -SeaWinds (on QuikSCAT and ADEOS II) and Advanced -Communications Technology Satellite (ACTS) Mobile Terminal -(AMT) projects benefited greatly from employing -the CAS component to support their science data systems. -QuikSCAT is in its 10th year of a planned 2-year mission -and is continuing to function in a nearly lights out mode. -Hardware has been added to the system to support the unplanned -data and processing volumes (reprocessing of 7 -years of data completed in 6 months, simultaneous with -daily operations) by simply changing the software configuration. -No software engineers were required to extend the -system. -While the CAS component successfully supported Sea- -Winds and AMT, the following JPL earth missions, Orbiting -Carbon Observatory (OCO) and NPP Sounder PEATE, -needed to support far more complex processing (greatly increased -data volumes and processing throughput) and various -hardware and platform configurations. This forced us to -rethink the CAS component implementation which resulted -in 1) the refactoring of the CAS component into two distinct -components, the File Manager and the Workflow Manager -and 2) the development of a third component to provide a -standard interface to various hardware and platform configurations, -the Resource Manager. -The refactoring of the CAS into the File Manager and the -Workflow Manager components solved several issues. First, -it decoupled the initiation of a workflow from the ingestion -of a file. 
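As the text goes on to explain, this decoupling means a workflow can be started by any named event, not only a file ingest. A minimal, self-contained Java sketch of that idea follows; it is illustrative only and not the Workflow Manager's actual API (Task, registerWorkflow and handleEvent are assumed names), but it shows how an ingest event, a scheduled event, or an operator request could all trigger the same kind of run-time workflow instance sharing one metadata context.

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/** Illustrative only: event-triggered workflow instances, not the real Workflow Manager. */
public class EventTriggeredWorkflowSketch {

    /** Hypothetical stand-in for a WorkflowTask that reads and writes a shared context. */
    interface Task {
        void execute(Map<String, String> context);
    }

    private final Map<String, List<Task>> workflowsByEvent = new LinkedHashMap<>();

    void registerWorkflow(String eventName, List<Task> orderedTasks) {
        workflowsByEvent.put(eventName, orderedTasks);
    }

    /** Any named event (file ingest, time of day, operator request) starts an instance. */
    void handleEvent(String eventName, Map<String, String> eventMetadata) {
        Map<String, String> context = new LinkedHashMap<>(eventMetadata);
        for (Task task : workflowsByEvent.getOrDefault(eventName, List.of())) {
            task.execute(context);   // task conditions and configuration are elided here
        }
        System.out.println(eventName + " instance finished with context " + context);
    }

    public static void main(String[] args) {
        EventTriggeredWorkflowSketch manager = new EventTriggeredWorkflowSketch();
        List<Task> pipeline = List.of(ctx -> ctx.put("Calibrated", "true"));
        manager.registerWorkflow("FileIngested", pipeline);
        manager.registerWorkflow("DailyRollup", pipeline);
        manager.handleEvent("FileIngested", Map.of("Filename", "granule_0001.dat"));
        manager.handleEvent("DailyRollup", Map.of("Date", "2009-07-01"));
    }
}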
Therefore, while workflows can be initiated based -on the ingestion of a particular file or file type, they can also -be initiated based on other events such as a specific time of -day, an operator request or a software request. Second, the -refactoring provides developers and system designers the -ability to utilize only the components they need. And third, -the refactoring supports independent evolution of the components, -and thus capabilities. The combination of these -three refactored CAS components have come to be known -as the Process Control System, or PCS. -In addition to the File Manager, Workflow Manager -and Resource Manager components that provide common -reusable capabilities for file and metadata management, -pipeline processing and job submission, we have also developed -reusable interfaces to these components to provide -additional commonly required capabilities for science data -management systems. To support the automation of file ingestion, -we have developed a configurable push-pull framework -and crawler framework. To provide easy integration -of science code in order to support all phases of algorithm -development (testbed, operations and science computing facility), -the PCS Task Wrapper has been developed. -In this paper we will describe our core PCS components, -their architecture, how they helped us solve problems on -OCO and NPP Sounder PEATE, and how they are positioning -us for the future of Earth science mission work. We believe -such work will necessitate the same spirt of architectural -reuse, understanding and mission specific adaptation -that led to the genesis of the modern PCS and that will ultimately -lead to its future evolution. We will argue in this paper -that our PCS uniquely positions us in the state of the art -in constructing large-scale, distributed, data-intensive GDS -software for NASA Earth science missions. -The rest of this paper is organized as follows. Section 2 -provides further background and related efforts in the areas -of grid computing, workflow systems and science data systems. -Section 3 describes the core PCS architectural components -in greater detail. Section 4 presents our experience -leveraging the PCS on OCO and NPP Sounder PEATE. Section -5 rounds out the paper with conclusions and highlights -our planned future work. -2 Background and Related Work -Since the development of the computational grid [8] as -a means for the virtualization and sharing of processing -and storage resources across organizational and geographic -boundaries, many groups and organizations have recognized -the power of the grid as an enabler of large-scale scientific -research. In this paper, we discuss ongoing software -projects and research initiatives relevant to the PCS. -2.1 Grid Systems -The Globus toolkit [9], developed by The Globus Alliance, -is a collection of open-source software tools for developing -distributed computing systems and applications. -The toolkit provides users with a suite of software components -and libraries that can either be used individually or -packaged together to implement the many aspects of a distributed, -service-oriented infrastructure including security, -resource and data discovery, access, and management, and -communication modules customized for a particular gridbased -effort. -2.2 Workflow Systems -The past ten years have witnessed an explosion in the -number of workflow languages and software systems developed -to support scientific workflows. 
Yu and Buyya [15] -attempted to taxonomize these scientific workflow systems, -largely according the underlying technologies with which -they were built. In addition to this taxonomy, Woollard, et. -al., presented a characterization of workflow systems based -the intended scientific use [14]. Specifically, the authors -classified certain workflow systems as Production Systems, -of which both the OCO and NPP Sounder PEATE ground -data systems are examples. -2.2.1 Condor -Condor [11] is a grid-based job scheduling system developed -at the University of Wisconsin Madison which aims, -among other things, to improve the effective usage of available -distributed computing and storage resources by detecting -and exploiting machine idle cycles. Condor provides -mechanisms for job queuing, setting scheduling policies, -and general resource management and monitoring. Condor -insulates users from the particulars of the details of the underlying -infrastructure by transparently handling decisions -about when and where jobs will be scheduled, monitoring -their execution, and producing notifications of completion. -While originally designed to operate in a workstation environment, -a variant of Condor, Condor-G [10], leverages the -Globus toolkit to provide a Condor implementation that is -interoperable with Globus-based grids. -2.2.2 Pegasus -Pegasus [7] is similar to Condor in that it provides a layer of -abstraction between the jobs to be processed and the hardware -that they will eventually be processed on. Developed -at the USC Information Science Pegasus is capable of dynamically -assigning computational workflows with multiple -processing steps to a large number of grid-based compute -nodes based on resource availability. In addition to generating -an initial workflow mapping, Pegasus offers the ability -to transparently remap a workflow, increasing the reliability -of the system in the event of failure in a small number of -compute nodes. -2.3 Science Data Processing Systems -Science Data Processing Systems provide the base level -of service needed to effectively manage the vast quantities -of intermediate and final data products generated by largescale, -computationally intensive research tasks. While there -are a large number of systems in operation, we focus our -discussion on those which provide services distinctly similar -to the PCS. -2.3.1 S4PA -The Simple, Scalable, Script-based Science Product -Archive (S4PA) [3], is a storage architecture developed and -deployed at NASAs Goddard Space Flight Center in support -of the operation of the Goddard Earth Science Data -and Information Services Center (GES DISC). As cost was -a primary factor in the development of S4PA, the developers -have taken pains to streamline the system. Hosting the -primary copy of all data online reduced the need for costly -physical media distribution, and utilizing the UNIX directory -structure, in combination with metadata-encoded filenames, -provides a simplified mechanism for archive and retrieval. -As its name implies, the S4PA is primarily a data archive -service. The PCS, as described in this paper, addresses data -archiving, but takes a more architecturally grounded approach, -eschewing scripts in favor of first-class architectural -components and connectors to implement complete, endto- -end data processing pipelines. 
Furthermore, as complete -science data processing pipelines are composed of a large -number of complimentary, interconnected services, a formal -architectural underpinning helps to provide unity and -cohesion among the constituent components. -2.4 Standards -Grid-based science data processing systems have matured -sufficiently for common themes, lessons, and challenges -to emerge among the many participants. As a result, -there are several ongoing efforts to codify the shared knowledge -and experience into formal standards. We discuss the -Open Grid Framework and the Open Archives Initiatives -Protocol for Metadata Harvesting. -2.4.1 OGF -The Open Grid Forum [2] is actively developing standards -and specifications with the goal of spreading the adoption -of grid-based software systems. The OGF is comprised -of business, government, scientific, and academic organizations -and focuses on interoperability as the key to expanding -the utilization of grids. Through both advocacy and policy, -the OGF represents an independent voice on the role of -grids, and their potential to aid modern research. -2.4.2 OAI -The Open Archives Initiative [1] also promotes standards -for interoperability and has developed, among others, the -Protocol for Metadata Harvesting (OMI-PMH). The goal -of the OMI-PMH is to improve application interoperability -by enabling consistency in the way metadata (data about -data) is exposed, accessed, and interpreted. By providing -a flexible, extensible standard interface to the rich array -of application-specific metadata currently stored in nonuniform, -distributed repositories, the OAI hopes to facilitate -the broader accessibility and usability of distributed data resources. -3 PCS Core Architecture -In this section, we describe the PCS core components. -The three PCS manager components, File Manager, Workflow -Manager, and Resource Manager, are daemon-like web -service components responsible for answering basic questions -regarding file locations, metadata, task control and -data flow, and resource availability, monitoring, and usage. -The three PCS frameworks together implement one of -two critical higher level services in data processing systems: -(1) managing the ingestion and acquisition of remotely acquired -datasets, handled via the Crawler Framework and -Push Pull components ; and (2) managing pipeline processing, -product ingestion and data production, handled via the -PCS Task Wrapper. We will describe each component in -greater detail below. The overall PCS architecture described -in this architecture is given in Fig. 1. -3.1 File Manager -The File Manager component is responsible for tracking, -ingesting and moving file data and metadata between a -client system and a server system. The File Manager is an -extensible software component that provides an XML-RPC -external interface, and a fully tailorable Java-based API for -file management. The critical objects managed by the File -Manager include: -Products - Collections of one or more files, and their associated -Metadata. -Metadata - A map of key to multiple values of descriptive -information about a Product. -References - Pointers to a Product file’s original location, -and to its final resting location within the archive constructed -by the File Manager. -Product Type - Descriptive information about a Product -that includes what type of file Uniform Resource Identifier -(URI) [5] generation scheme to use, the root -repository location for a particular Product, and a description -of the Product. 
-Element - A singular Metadata element, such as “Author”, -or “Creator”. Elements may have additional metadata, -in the form of the associated definition and even a corresponding -Dublin Core [4] attribute. -Versioner - A URI generation scheme for Product Types -that defines the location within the archive (built by -the File Manager) where a file belonging to a Product -(that belongs to the associated Product Type) should be -placed. -Each Product contains one or more References, and one -Metadata object. Each Product is a member of a single -Product Type. The Metadata collected for each Product is -defined by a mapping of Product Type to one or more Elements. -Each Product Type has an associated Versioner. -3.2 Workflow Manager -TheWorkflow Manager component is responsible for description, -execution, and monitoring of Workflows, using a -client, and a server system. Workflows are typically considered -to be sequences of tasks, joined together by control -flow, and data flow, that must execute in some ordered -fashion. Workflows typically generate output data, perform -routine management tasks (such as email, etc.), or describe -a business’s internal routine practices [14]. The Workflow -Manager is an extensible software component that provides -an XML-RPC external interface, and a fully tailorable Javabased -API for workflow management. The critical objects -managed by the Workflow Manager include: -Events - are what triggerWorkflows to be executed. Events -are named, and contain dynamic Metadata information, -passed in by the user. -Metadata - a dynamic set of properties, and values, provided -to a WorkflowInstance via a user-triggered -Event. -Workflow - a description of both the control flow, and data -flow of a sequence of tasks (or stages that must be executed -in some order. -Workflow Instance - an instance of a Workflow, typically -containing additional runtime descriptive information, -such as start time, end time, task wall clock time, etc. -A WorkflowInstance also contains a shared Metadata -context, passed in by the user who triggered theWorkflow. -This context can be read/written to by the underlying -WorkflowTasks, present in a Workflow. -Workflow Tasks - descriptions of data flow, and an underlying -process, or stage, that is part of a Workflow. -Workflow Task Instances - the actual executing code, or -process, that performs the work in the Workflow Task. -Workflow Task Configuration - static configuration -properties, that configure a WorkflowTask. -Workflow Conditions - any pre (or post) conditions on the -execution of a WorkflowTask. -Workflow Condition Instances - the actual executing -code, or process, that performs the work in the Workflow -Condition. -Each Event initiates one or more Workflow Instances, -providing a Metadata context (submitted by an external -user). Each Workflow Instance is a run-time execution -model of a Workflow. Each Workflow contains one or -more Workflow Tasks. Each Workflow Task contains a single -Workflow Task Configuration, and one or more Workflow -Conditions. Each Workflow Task has a corresponding -Workflow Task Instance (that it models), as does each -Workflow Condition have a corresponding Workflow Condition -Instance. -3.3 Resource Manager -The Resource Manager component is responsible for excecution, -monitoring and traacking of jobs, storage and networking -resources for an underlying set of hardware resources. 
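The File Manager data model described above (Products made of References plus exactly one Metadata object, grouped by Product Type, with a Versioner deciding where each file lands in the archive) can be made concrete with a short sketch. The Java below is illustrative only and not the File Manager's own classes (Product, Reference, Metadata and the date-based versioning scheme here are assumptions, and it uses records, so it assumes a recent JDK); it simply mirrors the cardinalities and the URI-generation idea stated in the text.

import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Illustrative only: a toy Product/Reference/Metadata model, not the File Manager's classes. */
public class FileManagerModelSketch {

    /** Multi-valued key/value descriptive information, as described for Metadata above. */
    static class Metadata {
        private final Map<String, List<String>> values = new HashMap<>();
        void add(String key, String value) {
            values.computeIfAbsent(key, k -> new ArrayList<>()).add(value);
        }
        List<String> get(String key) { return values.getOrDefault(key, List.of()); }
    }

    /** A pointer from a file's original location to its resting place in the archive. */
    record Reference(URI origin, URI archived) { }

    /** One or more References plus exactly one Metadata object, belonging to one Product Type. */
    record Product(String productType, List<Reference> references, Metadata metadata) { }

    /** A Versioner-like scheme: build the archive URI from the Product Type and a date. */
    static URI version(String archiveRoot, String productType, String yyyy, String mm, String fileName) {
        return URI.create(archiveRoot + "/" + productType + "/" + yyyy + "/" + mm + "/" + fileName);
    }

    public static void main(String[] args) {
        Metadata met = new Metadata();
        met.add("Creator", "ground data system");          // hypothetical element values
        met.add("ObservationDate", "2009-01-15");
        URI origin = URI.create("file:///staging/oco_l1b_20090115.h5");
        URI archived = version("file:///archive", "OcoL1b", "2009", "01", "oco_l1b_20090115.h5");
        Product p = new Product("OcoL1b", List.of(new Reference(origin, archived)), met);
        System.out.println(p.productType() + " -> " + p.references().get(0).archived());
    }
}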
-The Resource Manager is an extensible software -component that provides an XML-RPC external interface, -and a fully tailorable Java-based API for resource management. -The critical objects managed by the Resource Manager -include: -Job - an abstract representation of an execution unit, that -stores information about an underlying program, or execution -that must be run on some hardware node ,including -information about the Job Input that the Job -requires, information about the job load, and the queue -that the job should be submitted to. -Job Input - an abstrct representation of the input that a Job -requires. -Job Spec - a complete specification of a Job, including its -Job Input, and the Job definition itself. -Job Instance - the physical code that performs the underlying -job execution. -Resource Node - an available execution node that a Job is -sent to by the Resource Manager. -Each Job Spec contains exactly one Job, and Job Input. -Each Job Input is provided to a single Job. Each Job describes -a single Job Instance. And finally, each Job is sent -to exactly one Resource Node. -3.4 Crawler Framework -The Crawler Framework was an effort to standardize the -common ingestion activities that occur both in data acquisition -and archival, as well as those that occur in pipeline -processing. These types of activities regularly involve identification -of files and directories to crawl (based on e.g., -mime type, regular expressions, or direct user input), satisfaction -of ingestion pre-conditions (e.g., the current crawled -file has not been previously ingested), followed by metadata -extraction. After metadata extraction, crawled data follows -a standard three state lifecycle: (1) preIngestion - where -e.g., a file may be unzipped or pre-processed prior to ingestion; -(2) postIngest success, indicating a successful ingestion -has occurred and e.g., the origin data file from the -ingest area should be deleted; and (3) postIngest failure, indicating -that ingestion was not successful and some corrective -action, e.g,. moving the failed file to a failure area for -later examination, should occur. -To date, we have identified three types of Product -Crawlers, where each Crawler varies along the lines of customized -precondition verification, crawilng strategy, and -need for metadata extraction. The StdProductCrawler assumes -that a Metadata object has already been generated -and included with a Product prior to ingestion, so no further -work is required to generate Metadata from a Product – -the Product is ready to be ingested. The MetExtractorProductCrawler -is responsible for generating a Metadata object -dynamically, as files are encountered during the crawling -process. Finally, the AutoDetectCrawler uses a content -type identification and regular-expressions to identify Product -Types dynamically, and then defaults to the behavior of -the MetExtractorProductCrawler for Product Types identified -via content detection. The critical objects managed by -the Crawler Framework are: -Crawler Action - is attached to one or more of the three -phases, and when a ProductCrawler enters a given -phases, all the CrawlerActions attached to that phase -are executed. The valid phases are: preIngest, -postIngestSuccess and postIngestFailure. -Precondition Comparator - is used by MetExtractorProductCrawler -and AutoDetectProductCrawler. 
They are -part of those ProductCrawlers’ customized implementations -of precondition verification that identify appropriate -times to stifle or allow metadata extraction, and -ultimately ingestion, to occur. -Metadata Extractor - is run by the MetExtractorProductCrawler -and the AutoDetectProductCrawler to -generate Metadata for a Product file based on some -business rules and logic. -3.5 Push Pull Framework -The Crawler Framework supports many generic ingestion -services, including metadata extraction, crawling, and -ingestion; however, one service that necessitated further -work was the development of a protocol layer allowing -a ProductCrawler to obtain content using protocol plugins -that download content using implementations of remote -protocols such as HTTP, FTP, WinNT file system, HTTPS, -etc. -The Push Pull Framework is responsible for remote data -acquisition and acceptance over modern web protocols, -such as those mentioned above. The Push Pull Framework -is flexible in that it provides the ability to plug in different -Metadata Extractors, Data Protocols, Content Types, -etc. The framework supports parallel file transfers and data -downloads, email-based push data acceptance using IMAP, -SMTP protocols, and the ability to configure “Virtual” remote -directories (based on Metadata such as Date/Time) -from which files can be downloaded. -The critical objects managed by the Push Pull Framework -are: -Retrieval Method - defines the manner in which files are -retrieved from remote sites. It is given a configuration -file, the Parser for the file, and a FileRetrievalSystem -(which handles all the complexities of multi-threaded -file downloading). There are currently two out-of-the-box -RetrievalMethods: RemoteCrawler and ListRetriever. -RemoteCrawler is a configurable remote site -directory crawler that can filter files by regular expression. -ListRetriever will download a given list of file URIs -[5]. -Parser - parses a given configuration file into a Virtual- -FileStructure which is used to filter URIs to download. -Protocol - handles file transfer and communication via -some transfer protocol. Currently implemented Protocols -include: sftp, ftp, http, imaps, file (localhost). -3.6 PCS Task Wrapper -The PCS Task Wrapper framework is responsible for -standardizing the setup, process initiation, execution and -file management tasks surrounding execution of NASA -Product Generation Executives, or PGEs. PGEs codify a -scientific algorithm, some step in the overall scientific process -involved in a mission science workflow. -The PCS Task Wrapper provides a stable operating environment -to the underlying PGE during its execution lifecycle. -If the PGE requires a file, or metadata regarding the -file, the PCS Task Wrapper is responsible for delivering that -information to the PGE in a manner that meets its requirements. -If the PGE requires knowledge of upstream or downstream -PGEs in a sequence of executions, that information -is also made available, and finally if information regarding -disk space, node information such as CPU availability, etc., -is required, the PCS Task Wrapper provides this information -to the underlying PGE. After this information is collected, -the PGE is executed and its output Product file and -Metadata generation is managed via the PCS Task Wrapper -framework. The PCS Task Wrapper is responsible for -marshalling output Products and Metadata back to the File -Manager for use in downstream data processing and pedigree.
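The protocol plug-in idea described for the Push Pull Framework can be sketched as follows. The interface and method names here are illustrative assumptions only, not the framework's actual API; they simply show how one implementation per transfer scheme could be slotted in.

import java.io.File;
import java.net.URI;

// Illustrative protocol plug-in: one implementation per transfer scheme
// (sftp, ftp, http, imaps, file), selected by a RetrievalMethod at runtime.
interface TransferProtocol {
    void connect(URI remoteSite) throws Exception;                   // open a session to the remote site
    void download(String remotePath, File localDest) throws Exception; // fetch one remote file
    void disconnect() throws Exception;                              // close the session
}

// A RetrievalMethod would be handed the parsed configuration (the VirtualFileStructure),
// filter the remote URIs it describes, and pass each surviving URI to a
// FileRetrievalSystem that drives a TransferProtocol on a worker thread.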
-In support of this, the PCS Task Wrapper leverages -the Crawler Framework to ingest (during pipeline processing) -the output Product files and Metadata produced by the -PGE. -As can be gleaned from the above discussion, the PGE -Task Wrapper is really the unifying bridge between the execution -of a step in the overall processing pipeline, and the -available PCS component services and the information that -they collectively manage. -The critical objects managed by the PCS Task Wrapper -are: -PGETaskInstance - an abstract class which contains a -generalized set of actions usually performed when running -PGEs. Every variable and method is protected, -thus allowing subclasses to easily modify just those -generalized actions which need to be customized for -different PGEs. -Pge Config File Builder - builds a PgeConfig object and -sets additional Metadata which codifies the information -necessary for orchestrating a PGE through its lifecycle. -The PCS Task Wrapper is based on a simple but -powerful XML syntax which allows a scientist to simply -fill out an XML file to describe the necessary steps -to execute a PGE. -Config File Property Adder - builds the Pge Config file -object and sets custom PGE Metadata. This allows -for a general PgeConfigBuilder with different Config- -FilePropertyAdders for setting PGE specific fields in -the PgeConfig object. -Science Pge Config File Writer - passes PGE run information -via configuration files. This object allows -for any number of config files in any desired format -to be generated describing PGE input and those files -to be delivered to the PGE. The PCS Task Wrapper -provides existing implementations, including a default -XML Stylesheet Language (XSL) Transformation -based SciPgeConfigFileWriter. -(Figure 1. Component Interaction Within the PCS) -Pcs Met File Writer - aids in generating Metadata objects -associated with PGE output products. -4 Experience and Evaluation -We have successfully applied the Process Control System -(PCS) to existing NASA missions: the Orbiting Carbon -Observatory (OCO) mission, and the NPP Sounder PEATE -mission. Both systems involve tasks such as high throughput -job processing, terabyte-scale data management, and -science computing facilities. -4.1 Orbiting Carbon Observatory Mission -On OCO, the mission is using the File Manager to ingest -MODIS, CloudSat and other ancillary data products -for use in the high performance Level 2 Science Algorithm. -To date, OCO has already used the PCS software to process -over four terabytes of Fourier Transform Spectrometer -(FTS) data provided by ground-based instruments located -around the world (e.g., Park Falls, Montana, and Darwin, -Australia), and has used the software to support Instrument -Thermal Vacuum (TVAC) testing, processing 100% of all -data taken by the OCO instrument during TVAC. Also, the -PCS supports a science computing facility in which variants -of scientific software can be exercised prior to inclusion in -an operations pipeline. -4.2 NPP Sounder PEATE Mission -Specifically, NPP Sounder PEATE has already used the -File Manager and Workflow Manager to ingest and process -hundreds of gigabytes of IASI data (and is in preparation to -accept CRIMS data). Also on PEATE, the PCS is currently -being used to re-catalog over fifteen million existing science -data products from the NASA AIRS mission’s TLSCF. -In addition, the Resource Manager will be used on NPP to -support job processing across an eighty-node cluster.
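The PGETaskInstance description above (an abstract class with protected members so that subclasses override only what differs per PGE) might take roughly the following shape. This is a sketch under stated assumptions: the class, field, and method names are invented for illustration and are not the actual PCS Task Wrapper API.

// Illustrative shape of a PGE task instance: protected members so subclasses can
// override only the lifecycle steps that differ for a particular PGE.
abstract class PgeTaskSketch {
    protected Object pgeConfig;      // built by a Pge Config File Builder (assumed type)
    protected Object pgeMetadata;    // dynamic + static PGE metadata (assumed type)

    protected void buildConfig() { /* read the XML task description */ }
    protected void stageInputs() { /* query the File Manager for input Products */ }
    protected abstract int runPge() throws Exception;   // execute the science code
    protected void ingestOutputs() { /* hand output Products to the Crawler Framework */ }

    public void run() throws Exception {
        buildConfig();
        stageInputs();
        int status = runPge();
        if (status == 0) {
            ingestOutputs();   // marshal Products and Metadata back to the File Manager
        }
    }
}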
-4.3 Further Applications -In addition to the two aforementioned NASA missions, -the PCS framework is being leveraged on reimbursable -work for the National Cancer Institute (NCI)’s Early Detection -Research Network (EDRN) [6]. JPL leads the informatics -efforts on EDRN, and the PCS framework is being -used in the collection, annotation and dissemination of raw -scientific data supporting the early detection of cancer to -scientists across the country. -In the next year, PCS will also be used to support a new -JPL-led NASA mission, the Soil Moisture Active Passive -(SMAP) mission. The science computing facility designs -on OCO and NPP have been used to create an algorithm -testbed for SMAP scientists early in the design phase of the -mission so that software integration risks can be mitigated -during mission development [13]. -5 Conclusions and Future Work -While the norm for earth science missions has been for -each mission to develop its own one-off science data system -from scratch, the continual decrease in mission funding -combined with the exponential increase in mission complexity -(data volume and processing throughput) over the -last decade has made this approach passé and risky. It was -clear that the need for a new approach was imminent. -To this end, we have developed a standards-based software -framework to provide common science data system -services that yields the benefits of reuse while remaining -adaptable to address the requirements that are unique to the -customer. This reusable software is centered around the -most basic science data system functions that support file -and metadata management, workflow management, and resource -management. Additional frameworks augment the -core capabilities to provide automation for remote data acquisition, -data ingestion and standard pipeline processing. -This reusable software framework is the Process Control -System (PCS) we have described in this paper. -While the PCS has successfully supported the Orbiting -Carbon Observatory (OCO) and NPP Sounder PEATE -missions, upcoming missions in NASA’s Decadal Survey -present additional challenges. The JPL-led Soil Moisture -Active Passive (SMAP) Mission (currently in formulation -phase) will be using the PCS not only for operations, but -also for the algorithm testbed and the science computing facility. -Providing the operational infrastructure to the algorithm -team early in the mission lifecycle will greatly reduce -the cost and risk of development-to-operations for the most -costly and risky aspect of most earth science data systems, -the algorithms. However, this also means that easy integration -of algorithms and dynamic workflow specification -are our current focus for extending the PCS capabilities. -Not far behind SMAP is another JPL-led mission, the Deformation, -Ecosystem Structure and Dynamics of Ice (DESDynI) -Mission. The challenges of DESDynI require -us to consider the deployment of PCS components to support -a grid architecture, supporting distributed file management -and processing capabilities supported by centralized -access to a virtual science data system. -Acknowledgements -This effort was supported by the Jet Propulsion Laboratory, -managed by the California Institute of Technology -under a contract with the National Aeronautics and Space -Administration. -References -[1] Open archives initiative, http://www.openarchives.org. -[2] Open grid forum, http://www.ogf.org. -[3] S4pa, http://daac.gsfc.nasa.gov/techlab/s4pa/index.shtml.
-[4] Dublin core metadata element set, 1999. -[5] T. Berners-Lee, R. Fielding, and L. Masinter. Uniform resource -identifiers (uri): Generic syntax. Technical Report -RFC 2396, 1998. -[6] D. Crichton, S. Kelly, C. Mattmann, Q. Xiao, J. S. Hughes, -J. Oh, M. Thornquist, D. Johnsey, S. Srivastava, L. Essermann, -and W. Bigbee. A distributed information services -architecture to support biomarker discovery in early detection -of cancer. In e-Science, page 44, 2006. -[7] E. Deelman, J. Blythe, Y. Gil, C. Kesselman, G. Mehta, -S. Patil, M.-H. Su, K. Vahi, and M. Livny. Pegasus: Mapping -Scientific Workflows onto the Grid. 2004. -[8] I. Foster. The anatomy of the grid: Enabling scalable virtual -organizations. pages 6–7, 2001. -[9] I. Foster. Globus toolkit version 4: Software for serviceoriented -systems. pages 2–13. 2005. -[10] J. Frey, T. Tannenbaum, M. Livny, I. Foster, and S. Tuecke. -Condor-g: A computation management agent for multiinstitutional -grids. Cluster Computing, 5(3):237–246, July -2002. -[11] M. J. Litzkow, M. Livny, and M.W. Mutka. Condor-a hunter -of idle workstations. pages 104–111, 1988. -[12] C. Mattmann, D. J. Crichton, N. Medvidovic, and -S. Hughes. A software architecture-based framework for -highly distributed and data intensive scientific applications. -In ICSE, pages 721–730, 2006. -[13] D. Woollard, O. ig Kwoun, T. Bicknell, S. Dunbar, and -K. Leung. A science data system approach for the smap -mission. In IEEE Radar, 2009. -[14] D. Woollard, N. Medvidovic, Y. Gil, and C. A. Mattmann. -Scientific software as workflows: From discovery to distribution. -Software, IEEE, 25(4):37–43, 2008. -[15] J. Yu and R. Buyya. A taxonomy of workflow management -systems for grid computing, Apr 2005. -View publication stats \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions.txt.xml.xls b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..34943152c1de43027dec19842ea368fe9f27c1dd Binary files /dev/null and b/src/main/resources/cdtocode/doc/Apache OODT File Manager/A Reusable Process Control System Framework for the Orbiting Carbon Observatory and NPP Sounder PEATE missions.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component-relation.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component-relation.txt index 98281efb2fac7059f9bd7452c63995f31d82f4cf..6d6eda2f3f2f79375a3b058102f0978e4df65528 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component-relation.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component-relation.txt @@ -1,109 +1,225 @@ -Catalog and Archive File Management Component -Introduction -Project Description -Architecture -Extension Points -Current Extension Point Implementations -Use Cases -Conclusion -This is the developer guide for the Apache OODT Catalog and Archive Service (CAS) File Manager component, or File Manager for short. Primarily, this guide will explain the File Manager architecture and interfaces, including its tailorable extension points. 
For information on installation, configuration, and examples, please see our User Guides. - -The remainder of this guide is separated into the following sections: - -Project Description -Architecture -Extension Points -Current Extension Point Implementations -Project Description -The File Manager component is responsible for tracking, ingesting and moving file data and metadata between a client system and a server system. The File Manager is an extensible software component that provides an XML-RPC external interface, and a fully tailorable Java-based API for file management. - -Architecture -In this section, we will describe the architecture of the File Manager, including its constituent components, object model, and key capabilities. - -Components -The major components of the File Manager are the Client and Server, the Repository Manager, the Catalog, the Validation Layer, the Versioner, and the Transferer. The relationship between all of these components are shown in the diagram below: - -File Manager Architecture - -The File Manager Server contains both a Repository that manages products (and the products' location in the archive as specified by Versioner), and a Catalog that validates metadata via the Validation Layer. Transfer of data products from the Client to the Server is the domain of the Transfer and can be initiated at either the Client or the Server. - -Object Model -The critical objects managed by the File Manager include: - -Products - Collections of one or more files, and their associated Metadata. -Metadata - A map of key->multiple values of descriptive information about a Product. See CAS-Metadata for more information on Metadata. -Reference - A pointer to a Product file's (or files') original location, and to its final resting location within the archive constructed by the File Manager. -Product Type - Descriptive information about a Product that includes what type of file URI generation scheme to use, the root repository location for a particular Product, and a description of the Product. -Element - A singular Metadata element, such as "Author", or "Creator". Elements may have additional metadata, in the form of the associated definition and even a corresponding Dublin Core attribute. See CAS-Metadata for more information on Metadata Elements. -Versioner - A URI generation scheme for Product Types that defines the location within the archive (built by the File Manager) where a file belonging to a Product (that belongs to the associated Product Type) should be placed. -Each Product contains 1 or more References, and one Metadata object. Each Product is a member of a single Product Type. The Metadata collected for each Product is defined by a mapping of Product Type->1...* Elements. Each Product Type has an associated Versioner. These relationships are shown in the below figure. - -File Manager Object Model -Key Capabilities -The File manager has been designed with a new of key capabilities in mind. These capabilities include: - -Easy management of different types of Products. The Repository Manager extension point is responsible for managing Product Types, and their associated information. Management of Product Types includes adding new types, deleting and updating existing types, and retrieving Product Type Objects, by their ID or by their name. - -Support for different kinds of back end catalogs. The Catalog extension point allows Product instance metadata and file location information to be stored in different types of back end data stores quite easily. 
Existing implementations of the Catalog interface include a JDBC based back end database, along with a flat-file index powered by Lucene. - -Management of Product instance information. Management includes adding, deleting and updating product instance information, including file locations (References), along with Product Metadata. It also includes retrieving Metadata and References associated with existing Products as well as obtaining the Products themselves. - -Element management for Metadata. The File Manager's Validation Layer extension point allows for the management of Element policy information in different types of back end stores. For instance, Element policy could be stored in XML files, a Database, or a Metadata Registry. - -Data transfer mechanism interface. By having an extension point for Data Transfer, the File Manager can support different Data Transfer protocols, both local and remote. - -Advanced support for File Repository layouts. The Versioner extension point allows for different File Repository layouts based on Product Types. - -Support for multiple Product structures. The File Manager Client allows for Products to be Flat, or Hierarchical-based. Flat products are collections of singular files that are aggregated together to make a Product. Hierarchical Products are Products that contain collections of directories, and sub-directories, and files. - -Design for scalability. The File Manager uses the popular client-server paradigm, allowing new File Manager servers to be instantiated, as needed, without affecting the File Manager clients, and vice-versa. - -Standard communication protocols. The File Manager uses XML-RPC as its main external interface between the File Manager client and server. XML-RPC, the little brother of SOAP, is fast, extensible, and uses the underlying HTTP protocol for data transfer. - -RSS-based Product syndication. The File Manager web interface allows for the RSS-based syndication of Product feeds based on Product Type. - -Data transfer status tracking. The File Manager tracks all current Product and File transfers and even publishes an RSS-feed of existing transfers. - -This capability set is not exhaustive, and is meant to give the user a feel for what general features are provided by the File Manager. Most likely the user will find that the File Manager provides many other capabilities besides those described here. - -Extension Points -We have constructed the File Manager making use of the factory method pattern to provide multiple extension points for the File Manager. An extension point is an interface within the File Manager that can have many implementations. This is particularly useful when it comes to software component configuration because it allows different implementations of an existing interface to be selected at deployment time. - -The factory method pattern is a creational pattern common to object oriented design. Each File Manager extension point involves the implementation of two interfaces: an extension factory and an extension implementation. At run-time, the File Manager loads a properties file specifies a factory class to use during extension point instantiation. For example, the File Manager may communicate with a database-based Catalog and an XML-based Element Store (called a Validation Layer), or it may use a Lucene-based Catalog and a database-based Validation Layer. -Using extension points, it is fairly simple to support many different types of what are typically referred to as "plug-in architectures." 
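The factory-method wiring described above (a properties file naming a factory class that is instantiated at run time) can be sketched as follows. The property key, factory interface, and class names below are illustrative assumptions for the sketch, not the File Manager's actual configuration names or API.

import java.io.FileInputStream;
import java.util.Properties;

// Hypothetical illustration of selecting a Catalog implementation at deployment
// time: a properties file names a factory class, which is loaded by reflection.
interface CatalogFactory {
    Object createCatalog();   // returns the concrete Catalog extension implementation
}

class ExtensionPointLoader {
    static Object loadCatalog(String propsPath) throws Exception {
        Properties props = new Properties();
        try (FileInputStream in = new FileInputStream(propsPath)) {
            props.load(in);
        }
        // e.g. catalog.factory=org.example.LuceneCatalogFactory (illustrative key/value)
        String factoryClass = props.getProperty("catalog.factory");
        CatalogFactory factory =
                (CatalogFactory) Class.forName(factoryClass)
                                      .getDeclaredConstructor()
                                      .newInstance();
        return factory.createCatalog();
    }
}

Swapping a JDBC-backed Catalog for a Lucene-backed one then amounts to editing the properties file, which is the deployment-time flexibility the guide attributes to its extension points.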
Each of the core extension points for the File Manager is described below: - -Catalog The Catalog extension point is responsible for storing all the instance data for Products, Metadata, and for file References. Additionally, the Catalog provides a query capability for Products. -Data Transfer The Data Transfer extension point allows for the movement of a Product to and from the archive managed by the File Manager component. Different protocols for Data Transfer may include local (disk-based) copy, or remote XML-RPC based transfer across networked machines. -Repository Manager The Repository Manager extension point provides a means for managing all of the policy information (i.e., the Product Types and their associated information) for Products managed by the File Manager. -Validation Layer The Validation Layer extension point allows for the querying of element definitions associated with a particular Product Type. The extension point also maps Product Type to Elements. -Versioning The Versioning extension point allows for the definition of different URI generation schemes that define the final resting location of files for a particular Product. -System The extension point that provides the external interface to the File Manager services. This includes the File Manager server interface, as well as the associated File Manager client interface, that communicates with the server. -Current Extension Point Implementations -There are at least two implementations of all of the aforementioned extension points for the File Manager. Each extension point implementation is detailed in this section. - -Catalog -Data Source based Catalog. An implementation of the Catalog extension point interface that uses a JDBC accessible database backend. -Lucene based Catalog. An implementation of the Catalog extension point interface that uses the Lucene free text index system to store Product instance information. -Data Transfer -Local Data Transfer. An implementation of the Data Transfer interface that uses Apache's commons-io to perform local, disk based filesystem data transfer. This implementation also supports locally accessible Network File System (NFS) disks. -Remote Data Transfer. An implementation of the Data Transfer interface that uses the XML-RPC File Manager client to transfer files to a remote XML-RPC File Manager server. -InPlace Data Transfer. An implementation of the Data Transfer interface that avoids transfering any products -- this can be used in the situation where metadata about a particular product should be recorded, but no physical transfer needs to occur. -Repository Manager -Data Source based Repository Manager. An implementation of the Repository Manager extension point that stores Product Type policy information in a JDBC accessible database. -XML based Repository Manager. An implementation of the Repository Manager extension point that stores Product Type policy information in an XML file called product-types.xml -Validation Layer -Data Source based Validation Layer. An implementation of the Validation Layer extension point that stores Element policy information in a JDBC accessible database. -XML based Validation Layer. An implementation of the Validation Layer extension point that stores Element policy information in 2 XML files called elements.xml and product-type-element-map.xml -System (File Manager client and File Manager server) -XML-RPC based File Manager server. 
An implementation of the external server interface for the File Manager that uses XML-RPC as the transportation medium. -XML-RPC based File Manager client. An implementation of the client interface for the XML-RPC File Manager server that uses XML-RPC as the transportation medium. -Use Cases -The File Manager was built to support several of the above capabilities outlined in Section 3. In particular there were several use cases that we wanted to support, some of which are described below. - -File Manager Ingest Use Case -The red numbers in the above Figure correspond to a sequence of steps that occurs and a series of interactions between the different File Manager extension points in order to perform the file ingestion activity. In Step 1, a File Manager client is invoked for the ingest operation, which sends Metadata and References for a particular Product to ingest to the File Manager server’s System Interface extension point. The System Interface uses the information about Product Type policy made available by the Repository Manager in order to understand whether or not the product should be transferred, where it’s root repository path should be, and so on. The System Interface then catalogs the file References and Metadata using the Catalog extension point. During this catalog process, the Catalog extension point uses the Validation Layer to determine which Elements should be extracted for the particular Product, based upon its Product Type. After that, Data Transfer is initiated either at the client or server end, and the first step to Data Transfer is using the Product’s associated Versioner to generate final file References. After final file References have been determined, the file data is transferred by the server or by the client, using the Data Transfer extension point. - -Conclusion -The aim of this document is to provide information relevant to developers about the CAS File Manager. Specifically, this document has described the File Manager's architecture, including its constituent components, object model and key capabilities. Additionally, the this document provides an overview of the current implementations of the File Manager's extension points. - -In the Basic User Guide and Advanced User Guide, we will cover topics like installation, configuration, and example uses as well as advanced topics like scaling and other tips and tricks. 
\ No newline at end of file +catalog and archive file management component introduction project description architecture extension points current extension point implementations use case conclusion this&Catalog&依赖 +its&points& +guide&tailorable extension point&依赖 +guide&File Manager architecture and interface&依赖 +our&Guides& +remainder§ion&依赖 +remainder&guide&AGGREGATION +we&architecture&依赖 +we&constituent component&依赖 +its&components& +we&architecture&依赖 +we&constituent component&依赖 +architecture&File Manager&AGGREGATION +Client and Server&major component&依赖 +Client and Server&File Manager&依赖 +major component&File Manager&AGGREGATION +relationship&diagram&依赖 +Catalog&metada&依赖 +File Manager Server&Repository&依赖 +Catalog&Validation Layer&依赖 +products&location& +File Manager Server&' location&依赖 +Repository&product&依赖 +File Manager Server&archive as&依赖 +Transfer&Transfer&依赖 +Transfer&Transfer&依赖 +Transfer&data product&AGGREGATION +Transfer&Transfer&依赖 +domain&Transfer&AGGREGATION +Transfer&Transfer&依赖 +Transfer&Transfer&依赖 +Transfer&Transfer&依赖 +collection&one or more file&AGGREGATION +their&Metadata& +A map&key&AGGREGATION +> multiple value&descriptive information&AGGREGATION +See&Metadata&依赖 +See&more information&依赖 +its&location& +File Manager&what type&依赖 +File Manager&file urus generation scheme&依赖 +what type&file urus generation scheme&AGGREGATION +description&Product&AGGREGATION +element&form&依赖 +element&associated definition&依赖 +form&associated definition&AGGREGATION +element&additional metada&依赖 +A URI generation scheme&location&依赖 +A URI generation scheme&built&依赖 +A URI generation scheme&archive (&依赖 +Product&1 or more reference&依赖 +member&single Product Type&AGGREGATION +Product&single Product Type&依赖 +Product&mapping&依赖 +Product&Product Type&依赖 +mapping&Product Type&AGGREGATION +Product Type&associated Versioner&依赖 +relationship&below figure&依赖 +new&key capability&AGGREGATION +Easy management&different type&AGGREGATION +different type&Products&AGGREGATION +their&information& +their&ID& +Management&Product Types&AGGREGATION +Management&new type&依赖 +their&name& +Management&new type&依赖 +different kind&back end catalog&AGGREGATION +Catalog extension point&extension point&GENERALIZATION +Catalog extension point&product instance metada and file location information&依赖 +different type&back end data store&AGGREGATION +implementation&end database&依赖 +implementation&JDBC&依赖 +implementation&end database&依赖 +implementation&JDBC&依赖 +implementation&Catalog interface&AGGREGATION +Management&Product instance information&AGGREGATION +Management&include add , delete and update product instance information&依赖 +Management&file location&依赖 +It&Metadata and reference&依赖 +Manager&point& +different type&back end store&AGGREGATION +management&Element policy information&AGGREGATION +Element policy&instance&依赖 +Element policy&XML file&依赖 +Data Transfer&Transfer&GENERALIZATION +File Manager&different Data Transfer protocol&依赖 +Versioner extension point&different File Repository layout&依赖 +Versioner extension point&extension point&GENERALIZATION +Flat product&singular file&依赖 +collection&singular file&AGGREGATION +Products&directory&依赖 +collection&directory&AGGREGATION +File Manager&popular client-server paradigm&依赖 +File Manager&XML-RPC&依赖 +its&interface& +File Manager&File Manager client and server&依赖 +File Manager&main external interface&依赖 +little brother&SOAP&AGGREGATION +File Manager web interface&RSS-based syndication&依赖 +RSS-based syndication&Product feed&AGGREGATION +datum&status tracking&依赖 
+RSS-feed&transfer&AGGREGATION +File Manager¤t Product and File transfer&依赖 +Extension Points We&file manager make use&依赖 +file manager make use&factory method pattern&AGGREGATION +interface&many implementation&依赖 +extension point&File Manager&依赖 +different implementation&interface&AGGREGATION +it&software component configuration&依赖 +it&different implementation&依赖 +it&interface&实现 +File Manager extension point&implementation&依赖 +File Manager extension point&two interface&实现 +implementation&two interface&AGGREGATION +File Manager extension point&extension factory&实现 +File Manager load&factory class&依赖 +File Manager load&factory class&依赖 +File Manager load&run-time&依赖 +File Manager load&run-time&依赖 +File Manager&example&依赖 +File Manager&database-based Catalog&依赖 +it&Lucene-based Catalog&依赖 +it&many different type&实现 +The Data Transfer extension point&Product&依赖 +The Data Transfer extension point&movement&依赖 +The Data Transfer extension point&archive&依赖 +movement&Product&AGGREGATION +Different protocol&local ( disk-based ) copy&依赖 +Different protocol&local ( disk-based ) copy&依赖 +extension point&Product Type&依赖 +extension point&element&依赖 +different URI generation scheme&file&依赖 +different URI generation scheme&final resting location&依赖 +final resting location&file&AGGREGATION +different URI generation scheme&particular Product&依赖 +definition&different URI generation scheme&AGGREGATION +Catalog Data Source&base catalog&依赖 +implementation&Catalog extension point interface&AGGREGATION +implementation&a jdbc accessible database backend&依赖 +lucene base catalog&lucene base catalog&依赖 +implementation&catalog extension point interface&AGGREGATION +implementation&catalog extension point interface&依赖 +lucene base catalog&Lucene free text index system&依赖 +implementation&catalog extension point interface&依赖 +implementation&Data Transfer interface&AGGREGATION +Apache&commons-io& +implementation&locally accessible network file system ( nfs ) disk&依赖 +implementation&XML-RPC File Manager client&依赖 +XML-RPC File Manager client&File Manager client&GENERALIZATION +implementation&XML-RPC File Manager client&依赖 +implementation&datum transfer interface&AGGREGATION +InPlace Data Transfer .&product&依赖 +implementation&product type policy information&依赖 +implementation&product type policy information&依赖 +implementation&repository manager extension point&AGGREGATION +implementation&JDBC accessible database&依赖 +implementation&JDBC accessible database&依赖 +XML file&file&GENERALIZATION +implementation&JDBC accessible database&依赖 +implementation&JDBC accessible database&依赖 +implementation&element policy information&依赖 +implementation&element policy information&依赖 +implementation&validation layer extension point&AGGREGATION +Validation Layer extension point&2 XML file&依赖 +implementation&) xml-rpc&依赖 +Validation Layer extension point&Element policy information&依赖 +implementation&( file manager client&依赖 +implementation&) xml-rpc&依赖 +implementation&Validation Layer extension point&AGGREGATION +implementation&( file manager client&依赖 +File Manager&transportation medium&依赖 +implementation&File Manager&依赖 +File Manager&use xml-rpc&依赖 +implementation&File Manager&依赖 +File Manager&use xml-rpc&依赖 +implementation&File Manager&依赖 +File Manager&transportation medium&依赖 +implementation&external server interface&AGGREGATION +XML-RPC&File Manager client&依赖 +implementation&client interface&AGGREGATION +implementation&XML-RPC File Manager server&依赖 +XML-RPC File Manager server&transportation medium&依赖 +implementation&XML-RPC File Manager 
server&依赖 +XML-RPC File Manager server&XML-RPC&依赖 +implementation&XML-RPC File Manager server&依赖 +several&above capability&AGGREGATION +we&that&依赖 +manager ingest use case red number&step&依赖 +manager ingest use case red number&step&依赖 +sequence&step&AGGREGATION +series&interaction&AGGREGATION +manager ingest use case red number&sequence&依赖 +manager ingest use case red number&sequence&依赖 +ingest operation&ingest&依赖 +File Manager client&Step 1&依赖 +ingest operation&a particular product&依赖 +File Manager client&ingest operation&依赖 +ingest operation&Metadata and References&依赖 +server&point& +System Interface&information&依赖 +it&path& +System Interface&made&依赖 +System Interface&Product Type policy&依赖 +Metadata&Catalog extension point&依赖 +System Interface&file reference&依赖 +Catalog extension point&catalog process&依赖 +Catalog extension point&Validation Layer&依赖 +its&Type& +first step&’s associated versioner&依赖 +Product&Versioner& +first step&’s associated versioner&依赖 +Data Transfer&client or server end&依赖 +aim&document&AGGREGATION +document&architecture&依赖 +document&constituent components , object model and key capability&依赖 +Manager&architecture& +its&model& +current implementation&extension point&AGGREGATION +overview¤t implementation&AGGREGATION +Manager&points& +we&topic&依赖 +we&topic&依赖 +we&installation , configuration , and example use&依赖 +we&installation , configuration , and example use&依赖 diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component-simEnts.txt deleted file mode 100644 index 98281efb2fac7059f9bd7452c63995f31d82f4cf..0000000000000000000000000000000000000000 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component-simEnts.txt +++ /dev/null @@ -1,109 +0,0 @@ -Catalog and Archive File Management Component -Introduction -Project Description -Architecture -Extension Points -Current Extension Point Implementations -Use Cases -Conclusion -This is the developer guide for the Apache OODT Catalog and Archive Service (CAS) File Manager component, or File Manager for short. Primarily, this guide will explain the File Manager architecture and interfaces, including its tailorable extension points. For information on installation, configuration, and examples, please see our User Guides. - -The remainder of this guide is separated into the following sections: - -Project Description -Architecture -Extension Points -Current Extension Point Implementations -Project Description -The File Manager component is responsible for tracking, ingesting and moving file data and metadata between a client system and a server system. The File Manager is an extensible software component that provides an XML-RPC external interface, and a fully tailorable Java-based API for file management. - -Architecture -In this section, we will describe the architecture of the File Manager, including its constituent components, object model, and key capabilities. - -Components -The major components of the File Manager are the Client and Server, the Repository Manager, the Catalog, the Validation Layer, the Versioner, and the Transferer. 
The relationship between all of these components are shown in the diagram below: - -File Manager Architecture - -The File Manager Server contains both a Repository that manages products (and the products' location in the archive as specified by Versioner), and a Catalog that validates metadata via the Validation Layer. Transfer of data products from the Client to the Server is the domain of the Transfer and can be initiated at either the Client or the Server. - -Object Model -The critical objects managed by the File Manager include: - -Products - Collections of one or more files, and their associated Metadata. -Metadata - A map of key->multiple values of descriptive information about a Product. See CAS-Metadata for more information on Metadata. -Reference - A pointer to a Product file's (or files') original location, and to its final resting location within the archive constructed by the File Manager. -Product Type - Descriptive information about a Product that includes what type of file URI generation scheme to use, the root repository location for a particular Product, and a description of the Product. -Element - A singular Metadata element, such as "Author", or "Creator". Elements may have additional metadata, in the form of the associated definition and even a corresponding Dublin Core attribute. See CAS-Metadata for more information on Metadata Elements. -Versioner - A URI generation scheme for Product Types that defines the location within the archive (built by the File Manager) where a file belonging to a Product (that belongs to the associated Product Type) should be placed. -Each Product contains 1 or more References, and one Metadata object. Each Product is a member of a single Product Type. The Metadata collected for each Product is defined by a mapping of Product Type->1...* Elements. Each Product Type has an associated Versioner. These relationships are shown in the below figure. - -File Manager Object Model -Key Capabilities -The File manager has been designed with a new of key capabilities in mind. These capabilities include: - -Easy management of different types of Products. The Repository Manager extension point is responsible for managing Product Types, and their associated information. Management of Product Types includes adding new types, deleting and updating existing types, and retrieving Product Type Objects, by their ID or by their name. - -Support for different kinds of back end catalogs. The Catalog extension point allows Product instance metadata and file location information to be stored in different types of back end data stores quite easily. Existing implementations of the Catalog interface include a JDBC based back end database, along with a flat-file index powered by Lucene. - -Management of Product instance information. Management includes adding, deleting and updating product instance information, including file locations (References), along with Product Metadata. It also includes retrieving Metadata and References associated with existing Products as well as obtaining the Products themselves. - -Element management for Metadata. The File Manager's Validation Layer extension point allows for the management of Element policy information in different types of back end stores. For instance, Element policy could be stored in XML files, a Database, or a Metadata Registry. - -Data transfer mechanism interface. By having an extension point for Data Transfer, the File Manager can support different Data Transfer protocols, both local and remote. 
- -Advanced support for File Repository layouts. The Versioner extension point allows for different File Repository layouts based on Product Types. - -Support for multiple Product structures. The File Manager Client allows for Products to be Flat, or Hierarchical-based. Flat products are collections of singular files that are aggregated together to make a Product. Hierarchical Products are Products that contain collections of directories, and sub-directories, and files. - -Design for scalability. The File Manager uses the popular client-server paradigm, allowing new File Manager servers to be instantiated, as needed, without affecting the File Manager clients, and vice-versa. - -Standard communication protocols. The File Manager uses XML-RPC as its main external interface between the File Manager client and server. XML-RPC, the little brother of SOAP, is fast, extensible, and uses the underlying HTTP protocol for data transfer. - -RSS-based Product syndication. The File Manager web interface allows for the RSS-based syndication of Product feeds based on Product Type. - -Data transfer status tracking. The File Manager tracks all current Product and File transfers and even publishes an RSS-feed of existing transfers. - -This capability set is not exhaustive, and is meant to give the user a feel for what general features are provided by the File Manager. Most likely the user will find that the File Manager provides many other capabilities besides those described here. - -Extension Points -We have constructed the File Manager making use of the factory method pattern to provide multiple extension points for the File Manager. An extension point is an interface within the File Manager that can have many implementations. This is particularly useful when it comes to software component configuration because it allows different implementations of an existing interface to be selected at deployment time. - -The factory method pattern is a creational pattern common to object oriented design. Each File Manager extension point involves the implementation of two interfaces: an extension factory and an extension implementation. At run-time, the File Manager loads a properties file specifies a factory class to use during extension point instantiation. For example, the File Manager may communicate with a database-based Catalog and an XML-based Element Store (called a Validation Layer), or it may use a Lucene-based Catalog and a database-based Validation Layer. -Using extension points, it is fairly simple to support many different types of what are typically referred to as "plug-in architectures." Each of the core extension points for the File Manager is described below: - -Catalog The Catalog extension point is responsible for storing all the instance data for Products, Metadata, and for file References. Additionally, the Catalog provides a query capability for Products. -Data Transfer The Data Transfer extension point allows for the movement of a Product to and from the archive managed by the File Manager component. Different protocols for Data Transfer may include local (disk-based) copy, or remote XML-RPC based transfer across networked machines. -Repository Manager The Repository Manager extension point provides a means for managing all of the policy information (i.e., the Product Types and their associated information) for Products managed by the File Manager. -Validation Layer The Validation Layer extension point allows for the querying of element definitions associated with a particular Product Type. 
The extension point also maps Product Type to Elements. -Versioning The Versioning extension point allows for the definition of different URI generation schemes that define the final resting location of files for a particular Product. -System The extension point that provides the external interface to the File Manager services. This includes the File Manager server interface, as well as the associated File Manager client interface, that communicates with the server. -Current Extension Point Implementations -There are at least two implementations of all of the aforementioned extension points for the File Manager. Each extension point implementation is detailed in this section. - -Catalog -Data Source based Catalog. An implementation of the Catalog extension point interface that uses a JDBC accessible database backend. -Lucene based Catalog. An implementation of the Catalog extension point interface that uses the Lucene free text index system to store Product instance information. -Data Transfer -Local Data Transfer. An implementation of the Data Transfer interface that uses Apache's commons-io to perform local, disk based filesystem data transfer. This implementation also supports locally accessible Network File System (NFS) disks. -Remote Data Transfer. An implementation of the Data Transfer interface that uses the XML-RPC File Manager client to transfer files to a remote XML-RPC File Manager server. -InPlace Data Transfer. An implementation of the Data Transfer interface that avoids transfering any products -- this can be used in the situation where metadata about a particular product should be recorded, but no physical transfer needs to occur. -Repository Manager -Data Source based Repository Manager. An implementation of the Repository Manager extension point that stores Product Type policy information in a JDBC accessible database. -XML based Repository Manager. An implementation of the Repository Manager extension point that stores Product Type policy information in an XML file called product-types.xml -Validation Layer -Data Source based Validation Layer. An implementation of the Validation Layer extension point that stores Element policy information in a JDBC accessible database. -XML based Validation Layer. An implementation of the Validation Layer extension point that stores Element policy information in 2 XML files called elements.xml and product-type-element-map.xml -System (File Manager client and File Manager server) -XML-RPC based File Manager server. An implementation of the external server interface for the File Manager that uses XML-RPC as the transportation medium. -XML-RPC based File Manager client. An implementation of the client interface for the XML-RPC File Manager server that uses XML-RPC as the transportation medium. -Use Cases -The File Manager was built to support several of the above capabilities outlined in Section 3. In particular there were several use cases that we wanted to support, some of which are described below. - -File Manager Ingest Use Case -The red numbers in the above Figure correspond to a sequence of steps that occurs and a series of interactions between the different File Manager extension points in order to perform the file ingestion activity. In Step 1, a File Manager client is invoked for the ingest operation, which sends Metadata and References for a particular Product to ingest to the File Manager server’s System Interface extension point. 
The System Interface uses the information about Product Type policy made available by the Repository Manager in order to understand whether or not the product should be transferred, where it’s root repository path should be, and so on. The System Interface then catalogs the file References and Metadata using the Catalog extension point. During this catalog process, the Catalog extension point uses the Validation Layer to determine which Elements should be extracted for the particular Product, based upon its Product Type. After that, Data Transfer is initiated either at the client or server end, and the first step to Data Transfer is using the Product’s associated Versioner to generate final file References. After final file References have been determined, the file data is transferred by the server or by the client, using the Data Transfer extension point. - -Conclusion -The aim of this document is to provide information relevant to developers about the CAS File Manager. Specifically, this document has described the File Manager's architecture, including its constituent components, object model and key capabilities. Additionally, the this document provides an overview of the current implementations of the File Manager's extension points. - -In the Basic User Guide and Advanced User Guide, we will cover topics like installation, configuration, and example uses as well as advanced topics like scaling and other tips and tricks. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component.txt.xml.xls b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..3214e495f6c85d9cd627ada1113d1fba90949502 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Catalog and Archive File Management Component.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation-relation.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation-relation.txt index 4d392f092f1e1d48e7df22228826768f1c7e62aa..5e6d1644140bd63e9c95676ad76a2c09661c8a4e 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation-relation.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation-relation.txt @@ -1,46 +1,79 @@ - -转至元数据结尾 -由 Tom Barber创建, 最终由 Lewis John McGibbney修改于 三月 23, 2016转至元数据起始 -Goals -Background and strategic fit -Assumptions -Requirements -User interaction and design -Phase 1 -Phase 2 -Questions -Not Doing -Target release 0.13 -Epic -Document status DRAFT -Document owner -Tom Barber - -Designer -Developers -QA -Goals -Allow for OODT filemanager linear scalaing, distribution and seamless querying of multiple filemanagers on separate networks. -Background and strategic fit -To truely fit the "Distributed" aspect of OODT, the file manager component should allow for multiple file managers to be started and they allow for both local and remote querying of the file managers seamlessly. 
For example, I have an OODT installation in the UK and an OODT installation on a separate site in Australia, it makes more sense from a data transfer and performance perspective to allow those 2 FM's to operate independantly of each other, but allow for querying of both repositories as if they were one, so I could choose to retieve information from the Australia FM. This also allows users to say "give me all Excel files from all my sites", without having to point fm-client at all the different instances. - -Assumptions -Requirements -Mutliple file manager configuration and monitoring via PCS Developers should be able to define multiple file managers in the PCS so they are exposed to services like OPSUI -Query execution location A user using a file manager client, should be able to tell the file manager client it only wants to search the local file manager, or search a specific remote, or search every available file manager for data. -Coping with broken communications The File Manager should be able to cope with remote nodes going offline or becoming unavailable and fail gracefully. -Schema alignment/matching A user wishes to execute a query without knowing the underlying data model, on more than one file manager server. The query should be executed regardless and the relevant content returned. Must Have -User interaction and design -If you use filemgr client then a very simplistic implementation would be to extend the service to allow definitions of multiple local and remote file managers and allow the user to execute a query over each file manager and concat the result. This would be very quick to implement but doesn't scale well or support other services. - -Phase 1 -Currently OPSUI and the PCS platform only allows you to define 1 file manager per opsui instance, but to implement this feature successfully the OPSUI configuration needs to allow for multiple file managers, and when a user looks at the summary page the summary should show a grouping of all the available file managers, whilst also allowing the ability to filter by file manager. - -In summary the following changes will be made: - -Registration of multiple file managers within PCS/OPSUI -Changes made to the OPSUI monitoring pages to reflect multiple filemanagers -Changes made to the Product listing pages to allow display of content from multiple filemanagers and filter by file manager -Additional configuration within filemgr client to allow for lookup of available filemanagers from PCS and query them. -Phase 2 -Phase 2 of the enhancement would involve adding further distributed capabilities to the filemanager(and possibly PCS platform as a whole), by adding in an optional zookeeper configuration that would allow for nodes self registering, graceful handling of nodes disappearing and also leader election and so forth. I feel enhancing OODT with industry standard distributed configuration management that is already widely used in "Big Data" type deployments will help with scalability of the platform and resiliency over distributed locations. 
\ No newline at end of file +seamless querying&multiple filemanager&AGGREGATION +Designer Developers QA Goals Allow&multiple filemanager&依赖 +Designer Developers QA Goals Allow&seamless querying&依赖 +Designer Developers QA Goals Allow&multiple filemanager&依赖 +Designer Developers QA Goals Allow&separate network&依赖 +Designer Developers QA Goals Allow&oodt filemanager linear scalaing&依赖 +Designer Developers QA Goals Allow&seamless querying&依赖 +Designer Developers QA Goals Allow&oodt filemanager linear scalaing&依赖 +Designer Developers QA Goals Allow&separate network&依赖 +Designer Developers QA Goals Allow&oodt filemanager linear scalaing&依赖 +Designer Developers QA Goals Allow&separate network&依赖 +Designer Developers QA Goals Allow&multiple filemanager&依赖 +Designer Developers QA Goals Allow&seamless querying&依赖 +Designer Developers QA Goals Allow&separate network&依赖 +Designer Developers QA Goals Allow&oodt filemanager linear scalaing&依赖 +Designer Developers QA Goals Allow&multiple filemanager&依赖 +Designer Developers QA Goals Allow&multiple filemanager&依赖 +Designer Developers QA Goals Allow&separate network&依赖 +Designer Developers QA Goals Allow&seamless querying&依赖 +Designer Developers QA Goals Allow&seamless querying&依赖 +Designer Developers QA Goals Allow&oodt filemanager linear scalaing&依赖 +they&local and remote querying&依赖 +" Distributed " aspect&OODT&AGGREGATION +they&file manager&依赖 +local and remote querying&file manager&AGGREGATION +I&UK&依赖 +it&more sense&依赖 +I&UK&依赖 +it&data transfer and performance perspective&依赖 +I&OODT installation&依赖 +I&UK&依赖 +I&OODT installation&依赖 +FM&information& +I&UK&依赖 +I&OODT installation&依赖 +I&OODT installation&依赖 +my&sites& +opsui query execution location a user&file manager client&依赖 +they&opsui query execution location a user&依赖 +they&file manager client&依赖 +they&service&依赖 +it&local file manager&依赖 +opsui query execution location a user&file manager client&依赖 +File Manager&remote node&依赖 +Schema&query&依赖 +Schema&query&依赖 +you&filemgr client&依赖 +simplistic implementation&definition&依赖 +definition&multiple local and remote file manager&AGGREGATION +simplistic implementation&service&依赖 +simplistic implementation&multiple local and remote file manager&依赖 +grouping&available file manager&AGGREGATION +OPSUI configuration&feature&实现 +summary&available file manager&依赖 +summary&grouping&依赖 +PCS platform&you&依赖 +PCS platform&opsui instance&依赖 +PCS platform&1 file manager&依赖 +user&summary page&依赖 +Registration&content&依赖 +lookup&available filemanager&AGGREGATION +Registration&content&依赖 +Registration&display&依赖 +Registration&content&依赖 +Registration&multiple file manager&AGGREGATION +Registration&display&依赖 +Registration&content&依赖 +display&content&AGGREGATION +Registration&display&依赖 +Registration&display&依赖 +Phase 2 Phase 2&enhancement&AGGREGATION +Phase 2 Phase 2&further distributed capability&依赖 +graceful handling&node&AGGREGATION +Phase 2 Phase 2&further distributed capability&依赖 +scalability&platform and resiliency&AGGREGATION +type deployment&distributed location&依赖 +I&OODT&依赖 +type deployment&scalability&依赖 +type deployment&platform and resiliency&依赖 diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation-simEnts.txt deleted file mode 100644 index 4d392f092f1e1d48e7df22228826768f1c7e62aa..0000000000000000000000000000000000000000 --- 
a/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation-simEnts.txt +++ /dev/null @@ -1,46 +0,0 @@ - -转至元数据结尾 -由 Tom Barber创建, 最终由 Lewis John McGibbney修改于 三月 23, 2016转至元数据起始 -Goals -Background and strategic fit -Assumptions -Requirements -User interaction and design -Phase 1 -Phase 2 -Questions -Not Doing -Target release 0.13 -Epic -Document status DRAFT -Document owner -Tom Barber - -Designer -Developers -QA -Goals -Allow for OODT filemanager linear scalaing, distribution and seamless querying of multiple filemanagers on separate networks. -Background and strategic fit -To truely fit the "Distributed" aspect of OODT, the file manager component should allow for multiple file managers to be started and they allow for both local and remote querying of the file managers seamlessly. For example, I have an OODT installation in the UK and an OODT installation on a separate site in Australia, it makes more sense from a data transfer and performance perspective to allow those 2 FM's to operate independantly of each other, but allow for querying of both repositories as if they were one, so I could choose to retieve information from the Australia FM. This also allows users to say "give me all Excel files from all my sites", without having to point fm-client at all the different instances. - -Assumptions -Requirements -Mutliple file manager configuration and monitoring via PCS Developers should be able to define multiple file managers in the PCS so they are exposed to services like OPSUI -Query execution location A user using a file manager client, should be able to tell the file manager client it only wants to search the local file manager, or search a specific remote, or search every available file manager for data. -Coping with broken communications The File Manager should be able to cope with remote nodes going offline or becoming unavailable and fail gracefully. -Schema alignment/matching A user wishes to execute a query without knowing the underlying data model, on more than one file manager server. The query should be executed regardless and the relevant content returned. Must Have -User interaction and design -If you use filemgr client then a very simplistic implementation would be to extend the service to allow definitions of multiple local and remote file managers and allow the user to execute a query over each file manager and concat the result. This would be very quick to implement but doesn't scale well or support other services. - -Phase 1 -Currently OPSUI and the PCS platform only allows you to define 1 file manager per opsui instance, but to implement this feature successfully the OPSUI configuration needs to allow for multiple file managers, and when a user looks at the summary page the summary should show a grouping of all the available file managers, whilst also allowing the ability to filter by file manager. - -In summary the following changes will be made: - -Registration of multiple file managers within PCS/OPSUI -Changes made to the OPSUI monitoring pages to reflect multiple filemanagers -Changes made to the Product listing pages to allow display of content from multiple filemanagers and filter by file manager -Additional configuration within filemgr client to allow for lookup of available filemanagers from PCS and query them. 
-Phase 2 -Phase 2 of the enhancement would involve adding further distributed capabilities to the filemanager(and possibly PCS platform as a whole), by adding in an optional zookeeper configuration that would allow for nodes self registering, graceful handling of nodes disappearing and also leader election and so forth. I feel enhancing OODT with industry standard distributed configuration management that is already widely used in "Big Data" type deployments will help with scalability of the platform and resiliency over distributed locations. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation.txt.xml.xls b/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..8b1dec8525dd265cb9474b47539802cd949cca9e Binary files /dev/null and b/src/main/resources/cdtocode/doc/Apache OODT File Manager/File Manager Scale Out Planning - OODT - Apache Software Foundation.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester-relation.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester-relation.txt index 64874cd0ab7710c92ba8961cff86e020932c7839..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester-relation.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester-relation.txt @@ -1,26 +0,0 @@ -Catalog and Archive File Management Component 0.12 API -Packages -Package Description -org.apache.oodt.cas.filemgr.catalog -org.apache.oodt.cas.filemgr.catalog.solr -org.apache.oodt.cas.filemgr.cli.action -org.apache.oodt.cas.filemgr.datatransfer -org.apache.oodt.cas.filemgr.exceptions -org.apache.oodt.cas.filemgr.ingest -org.apache.oodt.cas.filemgr.metadata -org.apache.oodt.cas.filemgr.metadata.extractors -org.apache.oodt.cas.filemgr.metadata.extractors.examples -org.apache.oodt.cas.filemgr.repository -org.apache.oodt.cas.filemgr.structs -org.apache.oodt.cas.filemgr.structs.exceptions -org.apache.oodt.cas.filemgr.structs.query -org.apache.oodt.cas.filemgr.structs.query.conv -org.apache.oodt.cas.filemgr.structs.query.filter -org.apache.oodt.cas.filemgr.structs.type -org.apache.oodt.cas.filemgr.structs.type.examples -org.apache.oodt.cas.filemgr.system -org.apache.oodt.cas.filemgr.system.auth -org.apache.oodt.cas.filemgr.tools -org.apache.oodt.cas.filemgr.util -org.apache.oodt.cas.filemgr.validation -org.apache.oodt.cas.filemgr.versioning \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester-simEnts.txt deleted file mode 100644 index 64874cd0ab7710c92ba8961cff86e020932c7839..0000000000000000000000000000000000000000 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester-simEnts.txt +++ /dev/null @@ -1,26 +0,0 @@ -Catalog and Archive File Management Component 0.12 API -Packages -Package Description -org.apache.oodt.cas.filemgr.catalog -org.apache.oodt.cas.filemgr.catalog.solr -org.apache.oodt.cas.filemgr.cli.action -org.apache.oodt.cas.filemgr.datatransfer -org.apache.oodt.cas.filemgr.exceptions -org.apache.oodt.cas.filemgr.ingest -org.apache.oodt.cas.filemgr.metadata 
-org.apache.oodt.cas.filemgr.metadata.extractors -org.apache.oodt.cas.filemgr.metadata.extractors.examples -org.apache.oodt.cas.filemgr.repository -org.apache.oodt.cas.filemgr.structs -org.apache.oodt.cas.filemgr.structs.exceptions -org.apache.oodt.cas.filemgr.structs.query -org.apache.oodt.cas.filemgr.structs.query.conv -org.apache.oodt.cas.filemgr.structs.query.filter -org.apache.oodt.cas.filemgr.structs.type -org.apache.oodt.cas.filemgr.structs.type.examples -org.apache.oodt.cas.filemgr.system -org.apache.oodt.cas.filemgr.system.auth -org.apache.oodt.cas.filemgr.tools -org.apache.oodt.cas.filemgr.util -org.apache.oodt.cas.filemgr.validation -org.apache.oodt.cas.filemgr.versioning \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester.txt.xml.xls b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..f8af3037fb4043ef401be08ca2007838c77a1e74 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Interface Ingester.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker-relation.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker-relation.txt index 67416259f0fde4ffff53c54c973b52b030fce5e7..94de539222ff6a6e340f34e813b419d0bbc43832 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker-relation.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker-relation.txt @@ -1,141 +1,664 @@ -HAL Id:hal-01513774 -https://hal.inria.fr/hal-01513774 -Submitted on25Apr2017 -HAL is amulti-disciplinaryopenaccess -archiveforthedepositanddisseminationofsci- -entificresearchdocuments,whethertheyarepub- -lished ornot.Thedocumentsmaycomefrom -teachingandresearchinstitutionsinFranceor -abroad, orfrompublicorprivateresearchcenters. -L’archiveouvertepluridisciplinaire HAL, est -destinée audépôtetàladiffusiondedocuments -scientifiquesdeniveaurecherche,publiésounon, -émanantdesétablissementsd’enseignementetde -recherchefrançaisouétrangers,deslaboratoires -publics ouprivés. -Distributed underaCreativeCommons Attribution| 4.0InternationalLicense -Mahasen: DistributedStorageResourceBroker -K. Perera,T.Kishanthan,H.Perera,D.Madola,MalakaWalpola,Srinath -Perera -Tocitethisversion: -K. Perera,T.Kishanthan,H.Perera,D.Madola,MalakaWalpola,etal..Mahasen:Distributed -Storage ResourceBroker.10thInternationalConferenceonNetworkandParallelComputing(NPC), -Sep 2013,Guiyang,China.pp.380-392,￿10.1007/978-3-642-40820-5_32￿.￿hal-01513774￿ -Mahasen: Distributed Storage Resource Broker -K.D.A.K.S.Perera1, T Kishanthan1, H.A.S.Perera1, D.T.H.V.Madola1, Malaka Walpola1, Srinath Perera2 -1 Computer Science and Engineering Department, University Of Moratuwa, Sri Lanka. {shelanrc, kshanth2101, ashansa.perera, hirunimadola, malaka.uom}@gmail.com -2 WSO2 Lanka, No 59, Flower Road, Colombo 07, Sri Lanka -srinath@wso2.com -Abstract. Modern day systems are facing an avalanche of data, and they are being forced to handle more and more data intensive use cases. These data comes in many forms and shapes: Sensors (RFID, Near Field Communication, Weather Sensors), transaction logs, Web, social networks etc. As an example, weather sensors across the world generate a large amount of data throughout the year. 
Handling these and similar data require scalable, efficient, reliable and very large storages with support for efficient metadata based searching. This paper present Mahasen, a highly scalable storage for high volume data intensive applications built on top of a peer-to-peer layer. In addition to scalable storage, Mahasen also supports efficient searching, built on top of the Distributed Hash table (DHT) -1 Introduction -Currently United States collects weather data from many sources like Doppler readers deployed across the country, aircrafts, mobile towers and Balloons etc. These sensors keep generating a sizable amount of data. Processing them efficiently as needed is pushing our understanding about large-scale data processing to its limits. -Among many challenges data poses, a prominent one is storing the data and indexing them so that scientist and researchers can come and ask for specific type of data collected at a given time and in a given region. For example, a scientist may want to search for all Automated Weather data items collected in Bloomington area in June 15 between 8am-12pm. -Although we have presented meteorology as an example, there are many similar use cases. For instance, Sky server [1] is one of the best examples that illustrate the use case of large data generation. This project expects to collect 40 terabytes of data in five years. In its data collection, the photometric catalog is expected to contain about 500 distinct attributes for each of one hundred million galaxies, one hundred million stars, and one million quasars. Similarly many sciences, analytic processing organizations, data mining use cases etc., would want to store large amount of data and process them later in a selective manner. These systems often store data as files -and there have been several efforts to build large scale Metadata catalogs [2][3] and storage solutions[4][5] to support storing and searching those data items. One such example is AMGA metadata catalog [6] which was an effort to build replication and distribution mechanism for metadata catalogs. -As we discuss in the related work section, most of the metadata catalog implementations use centralized architectures and therefore have limited scalability unlike Mahasen. For example, Nirvana Storage [7] has a centralized metadata catalog which only supports scalability through vendor’s mechanism such as Oracle Real Application clusters. XML Metadata Concept catalog (XMC Cat) [8] is another centralized metadata catalog which stores hierarchical rich metadata. This paper presents Mahasen, a scalable metadata catalog and storage server built on top of a P2P technology. Further, it is built by distributing an open source centralized Data registry (WSO2 Registry). -Mahasen (Distributed Storage Resource Broker) is a Data Grid Management System (DGMS) that can manage a large volume of distributed data. It targets high volume data intensive applications. The architecture of Mahasen has been designed to present a single global logical namespace across all the stored data, and it maintains a metadata structure which can be used to search files based on its’ attributes. It is a network of storage servers that plays the dual purpose of a metadata catalog and a storage server. Mahasen will solve the huge data storage problem and fault tolerance in data intensive computing through aggregating low cost hardware while having both metadata and actual resources distributed without single point of failure. 
Metadata management will ensure the capability of searching files based on attributes of the stored resources. Mahasen has a metadata catalog, which is highly distributed and well scalable. The metadata layer ensures fault tolerance by keeping replicas of metadata. -The rest of the paper is organized as follows. The next section will discuss the related work in Metadata catalogs and Storage servers while comparing and contrasting them with Mahasen. The following section will discuss Mahasen architecture. The next section will present the performance evaluation of Mahasen. Finally the discussion section discusses limitations, other potential solutions and directions. -2 Related Work -2.1 Nirvana Storage -Nirvana SRB [7] is a middleware system that federates large heterogeneous data resources distributed across a network. The ability to access, manage, search and organize data across the entire SRB Federation is provided via a Global Namespace. MCAT is the centralized metadata repository which maintains two types of records – system- and user-metadata. Scalability of MCAT is achieved using database vendor’s mechanisms [9], hence limited by Relational DB scalability Limits. -Storage/Replication. The stored resources are divided as Physical resources, Logical resources and Cluster resources. Replication of resources across multiple servers ensures the availability and recoverability of resources during failovers. -Retrieve. Data stream routing is handled by SRB and TCP/IP, making the data transfer process transparent to the users.. -Search. Searching is done based on metadata attributes which are extracted and managed by the SRB. -Add/Update. Data can be added in two ways: Registration and Ingestion. Registration does not transfer any data but only creates a pointer to the data in MCAT. Ingestion is similar to registration but also transfers the data to an SRB storage resource. -Delete. If a file shadow object is used as a data object to ingest a file resource to SRB then file will be removed from MCAT but not from the physical location. -2.2 Apache OODT -OODT[10] is a middleware system for metadata that provides transparent access to the resources. It facilitates functionalities such as store, retrieve, search and analyze distributed data, objects and databases jointly. OODT provides a product service and profile service which manage data and metadata respectively. -Storage/Replication. OODT stores data product in a file-based storage in a distributed manner. They classify storage into three categories: on-line, near-line or off-line storage. -Retrieve. When OODT receives a request for retrieving a file, it issues a profile query to a product server that helps in resolving resources that could provide data. The response will include the target product server address in the form of a URI. The OODT issues a product query based on the profile query results to get the data, and it will actually retrieve data from the product server in a MIME-compliant format. -Search. OODT uses the profile server and the product server for searching the metadata and retrieve the products, and it has multiple of each type of server. OODT is based on client server architecture and it promotes REST-style architectural pattern for search and retrieve data. The profile or a subset of profile is returned for retrieval. -Add/Update. OODT provide data management including manage files and folders with the implementation of javax.sql.datasource interface. -Delete. 
The file management component of a Catalog and Archive Service support the delete of resource files and metadata through the implementation of javax.sql.datasource interface. -2.3 WSO2 Governance Registry -WSO2 Governance Registry [11] is a repository that allows users to store resources in a tree-structured manner, just like with a file system. However, unlike a file system, users may annotate resources using their custom properties, and also WSO2 Registry has built in metadata management features like tagging, associating resources. -However, WSO2 registry is backed by a Relational Database system, and it uses database features to store data, metadata, to manage them, and to search. Hence it has a centralized architecture. Mahasen extends that architecture to a distributed architecture. -Replication. There is no inbuilt mechanism to do the replication of resources in WSO2 registry. -Search. The WSO2 registry provides two types of searches. One is searching for a resource with their name, metadata etc., and it is implemented using underline relational database system. The second one is searching the content of resources, and implemented using Lucene [12]. The second search is only applicable to resources with textual content. -Add/Update. Adding of resources to registry can be done in two ways. First one is adding via the web interface provided by the registry. When adding a new resource, it is also possible to add additional metadata such as tags, properties of name value pairs, which later will be useful to search for that resource. The other way to add resources is by writing your own way by extending the registry API and exposing it as a web service. -The major limitation with registry, when storing resources, is the amount of memory available. Since it uses the java heap memory to buffer the resources before storing them, large files cannot be stored as the available memory is only limited to few hundred of megabytes. -2.4 Hadoop Distributed File System -Apache Hadoop Distributed File System is (HDFS)[13] is a file system designed to run on commodity hardware. HDFS has a master slave architecture that consists of a single NameNode as master and number of DataNodes. The NameNode is responsible of regulating access to files by client and managing the namespace of the file system. Generally DataNodes are deployed one per node in the cluster, and is responsible of managing storage attached to that node. -Storage / Replication. Hadoop supports hierarchical file organization where user can create directories and store files. It splits the file in to chunks with the default size of 64MB and stores them as sequence of blocks, and those blocks are stored in underlying file system of DataNodes. Those blocks are replicated for fault tolerance and the block size and the replication factor of data are configurable. -Retrieve. Applications that run on HDFS need streaming access to their data sets. Data nodes will be responsible for the read requests that issued from a user to retrieve data from the system. -Search. Hadoop Distributed File System does not provide a comprehensive search for users or applications, and it just fulfill the requirement of a distributed file system by supporting to locate the physical location of the file using the system specific metadata. -Add/Update. Writing to HDFS should be done by creating a new file and writing data to it. Hadoop addresses a single writer multiple readers’ model. Once the data is written and file is closed, one cannot remove or alter data. 
Data can be added to the file by reopening the file and appending new data. -Delete. When a file is deleted by a user or from an application, the particular resource is not immediately removed from HDFS. The resource will be renamed and copied in to /trash directory giving the possibility to restore as long as it remains in the trash. -Mahasen’s main differentiation from above systems comes from its scalability. It can scale significantly than Nirvana Storage that depends on relational databases to scale the system, since the Mahasen metadata layer is natively distributed using a DHT.WSO2 Registry provides the clustering as the scalability option, but it is not optimized for large file transfers and storing as it uses an ATOM based resource transfers. Furthermore, Mahasen provides users a comprehensive metadata model for managing the distributed resources they stored with user-defined metadata, unlike the HDFS, which only focuses on creating a Distributed file system. Further Mahasen's metadata layer is natively distributed and fault tolerant while HDFS has a single name node which can make fault tolerant only with an active passive failover configuration. -3 High Level Architecture -3.1 Mahasen High Level Architecture -As shown by Figure 1, Mahasen consists of several storage nodes which are connected as peers to a logical ring via FreePastry. Each node consists of a registry to store -metadata and a file system to store physical file parts. Once connected to the ring each node contributes to the metadata space as well as file storage capacity, scaling the system dynamically with new node additions. Nodes use underline DHT (FreePastry) routing protocol to communicate efficiently with each other. -Fig. 1. Mahasen High Level Architecture -Mahasen uses a WSO2 registry and the file system in each node and DHT based architecture is used to connect the nodes to a one unit. -Mahasen has a distributed metadata layer that stores data about the distributed files in Mahasen peer to peer network. The metadata catalog is used to broker the stored resources in the network and to assist the user to locate the files in Mahasen distributed environment abstracting the metadata management from the user. -Mahasen stores two main types of metadata, which are system-defined metadata and user-defined (descriptive) metadata. System defined metadata is mainly used for server side resource handling. File name, file size, stored node IPs of file are examples of the system-defined metadata. User defined metadata is used to provide users the searching capability on those metadata. User can add tags and properties (name, value pairs) to the files that are uploaded. -Fig. 2. Metadata Object Structure of Mahasen -When a file is uploaded connecting to a Mahasen node the file will be temporarily saved in that node. Then the node will act as the master node and split the file into pre-defined sized chunks and the split parts are stored in a selected set of the neighborhood nodes of master node through parallel transfer. Then the metadata object created by master node will be stored with replicas using PAST storage implementation of Free pastry. We have rewritten PAST node’s persistent storage such that the data will be stored in the WSO registry in that node. -After storing the metadata, the nodes that received file parts act as worker nodes and replicate their file parts in parallel according to the replicate request issued by the master node. 
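Before the worker-node bookkeeping described next, here is a minimal sketch of what the metadata object just outlined (Fig. 2) might look like, with system-defined and user-defined parts; the class and field names are illustrative only and are not taken from the Mahasen source.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Illustrative shape of a Mahasen metadata object. */
class FileMetadata {
    // System-defined metadata, used for server-side resource handling.
    String fileName;
    long fileSizeBytes;
    // For each chunk index, the IPs of the nodes that store a replica of that chunk.
    Map<Integer, List<String>> chunkLocations = new HashMap<>();

    // User-defined (descriptive) metadata, used for searching.
    List<String> tags = new ArrayList<>();
    Map<String, String> properties = new HashMap<>();
}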
Each worker node will update the metadata object with the stored locations of the file parts it replicated. This requires concurrent access to metadata objects, which Mahasen handles using the locking system provided by the lock manager of the DHT. -A user can request to download a file from any Mahasen node; the node will first generate the resource ID for the requested file and retrieve the metadata object. Then it extracts the locations of the Mahasen nodes that contain the file parts from the metadata object and retrieves those parts to the local machine. After all the parts are retrieved they are merged to recreate the original file, which is then streamed to the user. -Deletion can be performed with a single command across a heterogeneous storage system. When a delete request for a file is issued, by following the same method used for retrieving the file, Mahasen finds the nodes that store parts of the file and deletes them. Finally the metadata object and its replicas are also deleted. -When a user needs to update the user-defined metadata, the node that receives the update request retrieves the metadata object for the file from the DHT, updates it, and stores it back in the DHT. -Using this model, Mahasen has built a completely decentralized metadata system that handles metadata management in a highly scalable and efficient manner. -Mahasen keeps replicas of both actual files and metadata objects. The main purpose of keeping replicas is fault tolerance and failover recovery. We ensure the high availability of metadata while ensuring scalability using FreePastry's underlying DHT. -3.2 Mahasen Search -When the amount of data in the system grows, the complexity of the search increases. Mahasen builds a distributed data structure using the underlying DHT, which can improve the performance of the different search options that Mahasen supports. -The resources in Mahasen are associated with metadata, and for each tag or property in the system we maintain an index pointing to all resources which have that tag or property. This is implemented as a TreeMap [16], and the property trees are stored in the DHT, which handles their replicas. -Fig. 3. A Property Tree Stored in Mahasen Memory Storage -When a user sends a search request, Mahasen extracts the requested search and initiates the execution of the relevant search method. Then the resource IDs of the files which match the given input are retrieved from the relevant property tree. Extracting the relevant resource IDs is done as follows. -Users can send search requests to any Mahasen node, and when a node receives a search request, Mahasen takes the property name given by the client and generates the property tree ID for that property. If the current node has the index for the property, it retrieves the matching resource IDs for that property and sends them to the client. If not, the node acts as a master node, gets the node handles of the nodes which hold the specific property tree, and routes Mahasen search messages with the required parameters to those node handles. Those node handles will then get the relevant resource IDs from the property trees in their memory storage and send them back to the master node. -The property values in the property tree are sorted, so that if the search is a range based search, we can simply take the sub map between the initial and final property values and retrieve the set of resource IDs mapped to each node in the sub tree, as sketched below.
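The sorted per-property index just described maps directly onto java.util.TreeMap, which the paper cites [16]. The sketch below is a minimal illustration of a property tree and a range lookup via subMap; the class and method names are illustrative, not Mahasen's actual code.

import java.util.HashSet;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

/** Illustrative per-property index: sorted property value -> IDs of resources having that value. */
class PropertyTree {
    private final TreeMap<String, Set<String>> valueToResourceIds = new TreeMap<>();

    void add(String propertyValue, String resourceId) {
        valueToResourceIds.computeIfAbsent(propertyValue, v -> new HashSet<>()).add(resourceId);
    }

    /** Range-based search: take the sub map between the initial and final values and collect the IDs. */
    Set<String> rangeSearch(String fromValue, String toValue) {
        Set<String> result = new HashSet<>();
        SortedMap<String, Set<String>> sub = valueToResourceIds.subMap(fromValue, true, toValue, true);
        for (Set<String> ids : sub.values()) {
            result.addAll(ids);
        }
        return result;
    }
}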
Since these resource IDs represents the files having the given property values, Mahasen can look up for the metadata objects with those resource IDs and extract the file names to present to for the user. The operation of extracting the file names for the resource IDs has a high cost than extracting the matching resource IDs for the given search query. -Complete Data Structure built for Mahasen can support property based search, range based search, tag based search and Boolean operations for the properties such as AND operation and OR operation. The advanced search provided by Mahasen is capable of providing the search based on set of different properties and tags. -Mahasen Search utilizes the continuation model support by FreePastry in results retrieving and transferring. Therefore when a search request is issued, the application sends requests to look up node handles, which contain the particular TreeMap object to request results. Then the application will collect the first result incoming and resume action from the previous execution point. -3.3 File Handling -File Transfer. Mahasen is a network of storage nodes and users will be given a client which is the Mahasen Client to access and transfer files to the network. The Mahasen Client that is built using the Apache HttpClient [17] uses HTTP methods for transferring files to the network. First the client initiates a connection with one of the node in the network. An authenticated client is capable of uploading downloading, deleting, updating or searching for the files in the network. The File content will be added as an entity to the HTTP POST method and streamed to the target address. The receiving end will read the file stream and write it to the repository. -Replica Management. To achieve fault tolerance and failover recovery, the file will be split into a set of predefined chunks and each part will be replicated and stored in different nodes according to predefined replication factor. The placement of replicas is a critical part which affects the reliability and performance of the system. The purpose of having a policy for placement of replicas is for data reliability, availability, and network bandwidth utilization. The current policy of Mahasen is to store the replicated files in leaf nodes set to the initial node. The selection of nodes in the leaf set will be calculated using cost evaluation function which focus on the distance of the node. -After successfully transferring the file to the initial node, the client will be notified about the status of the file transfer and initial node will then replicate and transfer the file to other nodes. The number of copies kept for a file is called the replication factor of that file and will be decided by the Mahasen system. -File Splitting and Parallel transfer. Mahasen storage network is designed to store large files reliably across distributed nodes. When storing the file it will be split into blocks of fixed size and these blocks will be replicated across the network for fault tolerance. The transferring of replicated file blocks will be done in parallel to other nodes in order to utilize the bandwidth and to save time. -When focusing on the retrieval of a file by using the metadata object the system will then select a node which is closest to the reader node and download the blocks to the client. Downloading of file blocks will also be done in parallel and then the blocks will be merged to create the complete file. 
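A minimal sketch of the block splitting and parallel transfer idea described under File Splitting and Parallel transfer, assuming a fixed block size and a placeholder transfer method in place of Mahasen's actual Apache HttpClient based streaming; the block size, the round-robin placement, and all names here are assumptions for illustration.

import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

class ParallelBlockUploader {
    static final int BLOCK_SIZE = 4 * 1024 * 1024; // assumed block size; Mahasen's pre-defined size may differ

    /** Splits a file into fixed-size blocks and transfers them to the target nodes in parallel. */
    static void upload(String path, List<String> targetNodes) throws Exception {
        List<Callable<Void>> tasks = new ArrayList<>();
        try (RandomAccessFile file = new RandomAccessFile(path, "r")) {
            long length = file.length();
            int blockCount = (int) ((length + BLOCK_SIZE - 1) / BLOCK_SIZE);
            for (int i = 0; i < blockCount; i++) {
                byte[] block = new byte[(int) Math.min(BLOCK_SIZE, length - (long) i * BLOCK_SIZE)];
                file.seek((long) i * BLOCK_SIZE);
                file.readFully(block);
                String node = targetNodes.get(i % targetNodes.size()); // naive round-robin placement
                tasks.add(() -> { transfer(node, block); return null; });
            }
        }
        ExecutorService pool = Executors.newFixedThreadPool(Math.max(1, targetNodes.size()));
        pool.invokeAll(tasks); // blocks are sent concurrently to utilize bandwidth
        pool.shutdown();
    }

    /** Placeholder for the actual HTTP transfer (the paper streams file content with Apache HttpClient). */
    static void transfer(String node, byte[] block) throws IOException {
        // send the block to the node ...
    }
}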
-3.4 Mahasen API -Mahasen provides a complete API to perform CRUD operations and search. Users can develop external clients, apart from the default client Mahasen provides, and integrate with existing systems to perform resource management and search operations. -4 Performance Analysis -The scalability of the Mahasen system was tested by running a system with M nodes and N parallel clients, where M was 1, 6, 12, 18, 24 and N was 1, 5, 10, 15, 20. Each client carried out upload, download, delete and search operations 10 times and the average was taken. The system configuration used in this test was: two machines with Intel(R) Xeon(R) CPU E5-2403 1.80GHz 4-core processors and 24GB RAM, and one machine with an Intel(R) Xeon(R) CPU E5-2470 2.30GHz 8-core processor and 63GB RAM. The following figures (4 to 7) depict the results of this test. In the upload test, 500MB files were used by each client. -Fig. 4. Upload test results -In the results it is observed that as the number of clients increases, the upload time also increases. We believe that this is due to network congestion and the background processes of data replication across nodes. When the number of nodes increased to 18 or 24, a reduction in upload time was observed. This was expected behaviour, because the node which the client selects for upload distributes the replica management task to other nodes in the p2p ring. -Fig. 5. Download test results -When downloading files using the Mahasen client, it is observed that with the increase in the number of clients, the single node setup shows a significant growth in download time. In the performance test, a single node was chosen to receive the client request while it coordinates the file transfer from other nodes in the setup. Therefore, when there are multiple nodes in the system, file parts can be downloaded from other available nodes, which reduces the download time. -Fig. 6. Delete test results -When Mahasen performs a delete on a resource, it involves three operations: deleting metadata, deleting entries from the search index, and deleting the physical file. When more nodes are in the system, each node can participate in deleting its own files in parallel, making the system more scalable and efficient. -Fig. 7. Search test results -The search results illustrate that Mahasen can perform well even with more nodes added to the system. Usually a single node should have the lowest possible time as it does not have to search across the p2p ring, whereas with multiple nodes it has to aggregate results and present them to the client. It can be observed from the figure that, when more clients are in the system, results tend to converge to a lower value due to caching, as we requested the search operation through the same node. -5 Discussion and Future Work -Mahasen provides a highly scalable metadata structure with its peer-to-peer architecture in the metadata catalog. Unlike the existing metadata catalogs that use centralized architectures, Mahasen distributes metadata across the nodes in the system with replication, making the overall system scalable and fault tolerant. -Mahasen keeps replicas of both metadata objects and property trees as well. The DHT of FreePastry is used to store these objects in the system, which provides easy access to them. Keeping replicas of metadata objects and property tree objects does not cost as much as keeping replicas of actual files, which are very large in size compared to metadata and property tree objects.
By having these objects with replicas in the system, Mahasen has been able to ensure the correct functioning of many of the Mahasen operations even in the conditions like node failures. -An important contribution of Mahasen is developing a distributed indexing structure on top of the DHT for searching data products using different properties associated with data products. Since Mahasen needed to support range based queries, we evaluated earlier effort to build such index structures. Skip Tree Graph [18] was one of the best candidates we selected for search assisting data structure, which can efficiently support range based queries over a DHT. Since we had different properties and data structure had to grow in two dimensions, one in number of properties and the other one in number of entries for one property we were forced to create different DHTs for different properties. Therefore we needed to evaluate a much less complex -solution since maintaining different DHTs could have been very expensive in terms of resources. -When the system scales up with the large number of nodes, it will be more costly to issue a search operation on the available raw metadata stored. Therefore Mahasen developed a combined data structure with DHT and TreeMap as explained earlier. -When a Mahasen node fails, and it is detected by the existing nodes in the network, Mahasen replicates all the metadata objects and the property tree objects which were in the failed node to the existing Mahasen node reading them from other replicas. Mahasen helps in preserving the availability of metadata objects and property tree objects by maintaining the replication factor of them a constant. -Current Mahasen design has several limitations, which we plan to handle as future works. Currently Mahasen stores each property indexes in one Mahasen node and assumes that it will fit within the memory of that node. This may not be major concern for simple cases, and even NoSQL storages like Cassandra makes similar assumptions. Dividing the property tree into parts and storing them in different nodes when it is larger than a given size can solve this problem. We can predefine the maximum size of a part that will be residing in one node. -Another challenge is that search based multiple properties where at least one is a common property would force Mahasen to join large data sets, and one potential solution is to negotiate the size of data sets before start the data merging. -To summarize, Mahasen project builds a scalable storage solution by making a group of existing open source registries work as a one unit. It provides a one logical global namespace, and users may talk to any node of the group and perform any operations. -Mahasen connects nodes (registries) using PAST, a storage overlay implemented on top of Pastry DHT algorithm. Furthermore, Mahasen builds a distributed indexing structure on top of DHT to support property-based search of data items. -A user can benefit from the Web Service API provided and effectively utilize for batch processing of file uploading task through a custom client or basic client provided by Mahasen. -References -1. Alexander, S., Szalay, Peter, Z., Kunszt, Ani Thakar, Jim Gray, Don Slutz, and Robert, J., Brunner.: Designing and Mining Multi-Terabyte Astronomy Archives.: The Sloan Digital Sky Survey. In: SIGMOD ’00 Proceedings of the 2000 ACM SIGMOD international conference on Management of data (2000) -2. Chaitanya Baru, Reagan Moore, Arcot Rajasekar, Michael Wan.:The SDSC Storage Resource Broker (1998) -3. 
Reagan, W., Moore.: Managing Large Distributed Data Sets using the Storage Resource Broker (2010) -4. G., DeCandia, D., Hastorun, and M., Jampani.: Dynamo.: Amazon’s Highly Available Key-value Store (2010) -5. Ghemawat, S.-T., Leun, and H., Gobioff.: The Google File System. -6. B., K., Nuno Santos.: Distributed Metadata with the AMGA Metadata Catalog. -7. Nirvana Storage - Home of the Storage Resource Broker (SRB®), http://www.nirvanastorage.com/index.php?module=htmlpages&func=display&pid=1 (2011) -8. XML Metadata Concept Catalog (XMC Cat), Data to Insight Center, Indiana University Pervasive Technology Institute, http://d2i.indiana.edu/xmccat. -9. Nirvana Performance, http://www.nirvanastorage.com/index.php?module=htmlpages&func=display&pid=54. -10. ApacheTM OODT, http://oodt.apache.org/ (2011) -11. WSO2 Governance Registry - lean.enterprise.middleware - open source SOA | WSO2, http://wso2.com/products/governance-registry/ (2011) -12. Apache Lucene - Overview, http://lucene.apache.org/java/docs/index.html. -13. HDFS Architecture Guide, http://hadoop.apache.org/docs/r1.0.4/hdfs_design.html (2011) -14. Pastry - A scalable, decentralized, self-organizing and fault-tolerant substrate for peer-to-peer applications, http://www.freepastry.org/. -15. P., Druschel and A., Rowstron.: PAST: A large-scale, persistent peer-to-peer storage utility. In: HotOS VIII, Schoss Elmau, Germany (2001) -16. TreeMap (Java 2 Platform SE 5.0), http://download.oracle.com/javase/1.5.0/docs/api/java/util/TreeMap.html (2011) -17. HttpClient - HttpComponents HttpClient Overview, http://hc.apache.org/httpcomponents-client-ga/ (2011) -18. Alejandra Gonz´alez Beltr´an, Paul Sage and Peter Milligan.: Skip Tree Graph: a Distributed and Balanced Search Tree for Peer-to-Peer Networks. \ No newline at end of file +L’archiveouvertepluridisciplinaire HAL&deslaboratoire public oupriv�s&依赖 +L’archiveouvertepluridisciplinaire HAL&deslaboratoire public oupriv�s&依赖 +L’archiveouvertepluridisciplinaire HAL&deslaboratoire public oupriv�s&依赖 +Modern day system&avalanche&依赖 +avalanche&datum&AGGREGATION +Modern day system&datum&依赖 +datum&many forms and shape&依赖 +weather sensor&large amount&依赖 +weather sensor&datum&依赖 +weather sensor&year&依赖 +weather sensor&datum&依赖 +weather sensor&large amount&依赖 +weather sensor&large amount&依赖 +weather sensor&year&依赖 +weather sensor&datum&依赖 +weather sensor&year&依赖 +weather sensor&year&依赖 +large amount&datum&AGGREGATION +weather sensor&large amount&依赖 +weather sensor&datum&依赖 +similar datum&scalable , efficient , reliable and very large storage&依赖 +similar datum&scalable , efficient , reliable and very large storage&依赖 +similar datum&efficient metada&依赖 +paper&built&依赖 +paper&high volume datum intensive application&依赖 +paper&high volume datum intensive application&依赖 +paper&built&依赖 +paper&present mahasen&依赖 +paper&built&依赖 +top&peer-to-peer layer&AGGREGATION +paper&present mahasen&依赖 +paper&high volume datum intensive application&依赖 +paper&present mahasen&依赖 +Mahasen&weather datum&依赖 +top&distribute hash table ( dht ) 1 introduction currently united states&AGGREGATION +sizable amount&datum&AGGREGATION +sensor&datum&依赖 +sensor&sizable amount&依赖 +Processing&understanding&依赖 +its&limits& +our&understanding& +Processing&large-scale data process&依赖 +Processing&large-scale data process&依赖 +Processing&understanding&依赖 +prominent one&many challenge datum&依赖 +scientist and researcher&specific type&依赖 +specific type&datum&AGGREGATION +scientist and researcher&datum&依赖 +prominent one&datum&依赖 +scientist&Automated 
Weather data item&依赖 +scientist&8am-12pm&依赖 +scientist&example&依赖 +we&example&依赖 +we&meteorology&依赖 +Sky server [ 1 ]&instance&依赖 +best example&use case&依赖 +best example&large data generation&依赖 +one&best example&AGGREGATION +use case&large data generation&AGGREGATION +Sky server [ 1 ]&best example&依赖 +40 terabyte&datum&AGGREGATION +project&datum&依赖 +project&40 terabyte&依赖 +its&collection& +similarly many science&large amount&依赖 +similarly many science&large amount&依赖 +similarly many science&datum&依赖 +similarly many science&large amount&依赖 +similarly many science&datum&依赖 +similarly many science&datum&依赖 +system&file&依赖 +system&datum&依赖 +[ 2 ] [ 3 ] and storage solution&4 ] [ 5 ]&依赖 +we&related work section&依赖 +most¢ralized architecture&依赖 +most&metadata catalog implementation&AGGREGATION +most¢ralized architecture&实现 +vendor&mechanism& +centralized metadata catalog&’s mechanism&依赖 +Nirvana Storage [ 7 ]¢ralized metadata catalog&依赖 +centralized metadata catalog&scalability&依赖 +centralized metadata catalog&Oracle Real Application cluster&依赖 +Nirvana Storage [ 7 ]&example&依赖 +store&which&依赖 +store&hierarchical rich metada&依赖 +paper&scalable metadata catalog and storage server&依赖 +top&P2P technology&AGGREGATION +paper&Mahasen&依赖 +storage server&server&GENERALIZATION +paper&built&依赖 +a datum grid management system ( dgms )&distributed datum&依赖 +a datum grid management system ( dgms )&large volume&依赖 +large volume&distributed datum&AGGREGATION +It&high volume data intensive application&依赖 +architecture&Mahasen&AGGREGATION +its&attributes& +it&metadata structure&依赖 +It&storage server&依赖 +dual purpose&metadata catalog&AGGREGATION +network&dual purpose&依赖 +network&storage server&AGGREGATION +network&metadata catalog&依赖 +Mahasen&huge data storage problem and fault tolerance&依赖 +Mahasen&data intensive compute&依赖 +single point&failure&AGGREGATION +Metadata management&search file&依赖 +Metadata management&capability&依赖 +attribute&stored resource&AGGREGATION +Mahasen&metadata catalog&依赖 +metadata layer&fault tolerance&依赖 +metadata layer&replica&依赖 +metadata layer&metada&依赖 +replica&metada&AGGREGATION +rest&paper&AGGREGATION +rest&follow&依赖 +next section&related work&依赖 +next section&Metadata catalog&依赖 +section&Mahasen architecture&依赖 +Mahasen architecture&architecture&GENERALIZATION +performance evaluation&Mahasen&AGGREGATION +next section&Mahasen&依赖 +next section&performance evaluation&依赖 +discussion section§ion&GENERALIZATION +discussion section&limitation&依赖 +middleware system&large heterogeneous data resource&依赖 +centralized metadata repository&two type&依赖 +two type&records – system&AGGREGATION +centralized metadata repository&records – system&依赖 +vendor&mechanisms& +Scalability&MCAT&AGGREGATION +stored resource&Physical resource&依赖 +Replication&availability and recoverability&依赖 +Replication&resource&依赖 +Replication&resource&依赖 +Replication&availability and recoverability&依赖 +Replication&availability and recoverability&依赖 +Replication&resource&AGGREGATION +Replication&availability and recoverability&依赖 +Replication&resource&依赖 +Replication&availability and recoverability&依赖 +Replication&availability and recoverability&依赖 +Replication&resource&依赖 +Replication&resource&依赖 +Replication&resource&依赖 +availability and recoverability&resource&AGGREGATION +data transfer process&user&依赖 +datum&two way&依赖 +datum&Registration and Ingestion&依赖 +Registration&datum&依赖 +Ingestion®istration&依赖 +file resource&resource&GENERALIZATION +srb then file&MCAT&依赖 +a datum object&file resource&依赖 +2.2 Apache OODT OODT [ 10 
]&metada&依赖 +functionality&distribute datum , object and database&依赖 +functionality&distribute datum , object and database&依赖 +product service and profile service&data and metada&依赖 +OODT&file-based storage&依赖 +OODT&distributed manner&依赖 +OODT&data product&依赖 +They&three category&依赖 +They&on-line , near-line or off-line storage&依赖 +They&storage&依赖 +OODT&request&依赖 +it&profile query&依赖 +it&product server&依赖 +OODT&file&依赖 +product server&server&GENERALIZATION +form&URI&AGGREGATION +target product server address&form&AGGREGATION +response&URI&依赖 +it&datum&依赖 +it&product server&依赖 +it&product server&依赖 +oodt issue&product query&依赖 +oodt issue&product query&依赖 +it&datum&依赖 +oodt issue&product query&依赖 +it&server&依赖 +type&server&AGGREGATION +multiple&type&AGGREGATION +OODT&profile server&依赖 +profile server&server&GENERALIZATION +it&multiple&依赖 +it&type&依赖 +it&REST-style architectural pattern&依赖 +OODT&client server architecture&依赖 +it&search&依赖 +profile&retrieval&依赖 +profile&profile&AGGREGATION +implementation&javax.sql.datasource interface&AGGREGATION +file management component&resource files and metada&依赖 +file management component&delete&实现 +file management component&resource files and metada&依赖 +file management component&resource files and metada&实现 +file management component&resource files and metada&实现 +file management component&resource files and metada&实现 +file management component&delete&实现 +file management component&resource files and metada&实现 +file management component&delete&实现 +file management component&delete&实现 +file management component&delete&依赖 +file management component&Catalog and Archive Service&AGGREGATION +file management component&delete&依赖 +delete&resource files and metada&AGGREGATION +file system&system&GENERALIZATION +javax.sql.datasource interface&resource&依赖 +javax.sql.datasource interface&user&依赖 +WSO2 Registry&metadata management feature&依赖 +user&resource&依赖 +their&properties& +user&custom property&依赖 +user&custom property&依赖 +user&resource&依赖 +WSO2 Registry&resource&依赖 +WSO2 registry&Relational Database system&依赖 +it&database feature&依赖 +datum , metada&them&依赖 +it¢ralized architecture&依赖 +Mahasen&architecture&依赖 +Mahasen&distributed architecture&依赖 +distributed architecture&architecture&GENERALIZATION +replication&resource&AGGREGATION +two type&search&AGGREGATION +One&resource&实现 +One&metadata etc.&实现 +One&name&实现 +their&name& +second one&resource&依赖 +content&resource&AGGREGATION +second one&content&依赖 +second search&resource&依赖 +second search&textual content&依赖 +textual content&content&GENERALIZATION +it&property&依赖 +property&name value pair&AGGREGATION +it&name value pair&依赖 +it&tag&依赖 +it&additional metada&依赖 +your&way& +other way&own way&依赖 +other way&own way&依赖 +major limitation&memory&依赖 +amount&memory&AGGREGATION +major limitation&memory&依赖 +available memory&megabyte&依赖 +it&java heap memory&依赖 +few hundred&megabyte&AGGREGATION +hdf&master slave architecture&依赖 +master and number&DataNodes&AGGREGATION +namespace&file system&AGGREGATION +user&directory and store file&依赖 +Hadoop&hierarchical file organization&依赖 +sequence&block&AGGREGATION +block&file system&依赖 +file system&DataNodes&AGGREGATION +It&file&依赖 +block&DataNodes&依赖 +It&chunk&依赖 +default size&64MB&AGGREGATION +block&fault tolerance&依赖 +replication factor&datum&AGGREGATION +application&streaming access&依赖 +application&data set&依赖 +their&sets& +Data node&read request&依赖 +requirement&distributed file system&AGGREGATION +file&system specific metada&依赖 +physical location&system specific 
metada&AGGREGATION +file&using&依赖 +it&distributed file system&依赖 +it&requirement&依赖 +hadoop address&single writer multiple readers ’ model&依赖 +one&datum&依赖 +datum&file&依赖 +file&user&依赖 +particular resource&hdf&依赖 +resource&/ trash directory&依赖 +it&trash&依赖 +resource&possibility&依赖 +its&scalability& +Mahasen&differentiation& +it&large file transfer&依赖 +scalability&system&依赖 +it&an atom base resource transfer&依赖 +It&Nirvana Storage&依赖 +they&user-defined metada&依赖 +Mahasen&layer& +hdf&single name node&依赖 +single name node&active passive failover configuration&依赖 +single name node&fault tolerant&依赖 +Mahasen&several storage node&依赖 +node&metada&依赖 +node&a registry&依赖 +node&physical file part&依赖 +node&system&依赖 +node&dht ( freepastry ) route protocol&依赖 +node&communicate&依赖 +Mahasen High Level Architecture Mahasen&WSO2 registry&依赖 +Mahasen&peer&依赖 +peer network&network&GENERALIZATION +stores datum&peer network&依赖 +stores datum&peer network&依赖 +Mahasen&distributed metadata layer&依赖 +stores datum&peer network&依赖 +two main type&metada&AGGREGATION +mahasen store&metada&依赖 +mahasen store&two main type&依赖 +System defined metada&server side resource handling&依赖 +store node ip&file&AGGREGATION +example&system-defined metada&AGGREGATION +User&tags and property ( name&依赖 +User&value pair&依赖 +User&( name&依赖 +User&tags and property ( name&依赖 +User&value pair&依赖 +User&( name&依赖 +Metadata Object Structure&Mahasen&AGGREGATION +Mahasen node&node&GENERALIZATION +Metadata Object Structure&node&依赖 +master node&node&GENERALIZATION +selected set&neighborhood node&AGGREGATION +neighborhood node&master node&AGGREGATION +split part&selected set&依赖 +split part&master node&依赖 +split part&neighborhood node&依赖 +node&master node&依赖 +split part¶llel transfer&依赖 +metadata object&Free pastry&实现 +metadata object&PAST storage implementation&依赖 +metadata object&replica&依赖 +PAST storage implementation&Free pastry&AGGREGATION +node&storage& +node&file part&依赖 +their&parts& +node&worker node&依赖 +worker node&file part&依赖 +them&system&依赖 +stored location&file part&AGGREGATION +capability&concurrent access&AGGREGATION +worker node&stored location&依赖 +worker node&node&GENERALIZATION +lock manager&DHT&AGGREGATION +worker node&metadata object&依赖 +User&Mahasen node&依赖 +node&resource ID&依赖 +node&request and&依赖 +User&file&依赖 +location&metadata object&依赖 +it&location&依赖 +location&Mahasen node&AGGREGATION +location&file part&依赖 +it&Mahasen node&依赖 +file&user&依赖 +Deletion&heterogeneous storage system&依赖 +Deletion&single command&依赖 +node&file&依赖 +node&part&依赖 +part&file&AGGREGATION +Mahasen&node&依赖 +update request&request&GENERALIZATION +node&DHT&依赖 +metadata object&replica&依赖 +node&metadata object&依赖 +user&user-defined metada&依赖 +node&update request&依赖 +node&file&依赖 +Mahasen&complete decentralized metadata system&依赖 +complete decentralized metadata system&scalable and efficient manner&依赖 +complete decentralized metadata system&metadata management&依赖 +replica&both actual file and metada object&AGGREGATION +Mahasen&both actual file and metada object&依赖 +Mahasen&replica&依赖 +main purpose&replica&AGGREGATION +high availability&metada&AGGREGATION +We&high availability&依赖 +We&metada&依赖 +pastry&DHT& +amount&datum&AGGREGATION +3.2 Mahasen&complexity&依赖 +3.2 Mahasen&search increase&依赖 +complexity&search increase&AGGREGATION +3.2 Mahasen&Mahasen&GENERALIZATION +Mahasen&that&依赖 +Mahasen&a distribute datum structure&依赖 +DHT&different search option&依赖 +DHT&performance&依赖 +Mahasen&using&依赖 +performance&different search option&AGGREGATION +we&pointing&依赖 
+resource&tag or property&依赖 +we&index&依赖 +resource&metada&依赖 +replica&it&AGGREGATION +a treemap [ 16 ]&a treemap [ 16 ]&依赖 +DHT&replica&依赖 +DHT&it&依赖 +property tree&DHT&依赖 +mahasen extract&requested search&依赖 +mahasen extract&requested search&依赖 +search request&request&GENERALIZATION +execution&relevant search method&AGGREGATION +user&search request&依赖 +resource id&file&AGGREGATION +resource id&relevant property tree&依赖 +user&search request&依赖 +user&Mahasen node&依赖 +Mahasen&property name&依赖 +property name&name&GENERALIZATION +node&search request&依赖 +Mahasen&given&依赖 +it&resource id&依赖 +it&property&依赖 +current node&index&依赖 +current node&property&依赖 +node handle&node&AGGREGATION +node&specific property tree&依赖 +node&master node&依赖 +node handle&relevant resource id&依赖 +their&storage& +node handle&memory storage&依赖 +memory storage&storage&GENERALIZATION +node handle&property tree&依赖 +we&sub map&依赖 +we&sub map&依赖 +set&resource id&AGGREGATION +resource id&given property value&依赖 +resource id&file&依赖 +operation&resource id&依赖 +operation&resource id&依赖 +operation&resource id&依赖 +operation&given search query&依赖 +operation&given search query&依赖 +operation&high cost&依赖 +operation&given search query&依赖 +operation&high cost&依赖 +operation&high cost&依赖 +Complete Data Structure&property base search&依赖 +Complete Data Structure&property base search&依赖 +set&different property and tag&AGGREGATION +Mahasen Search&continuation model support&依赖 +Mahasen Search&FreePastry&依赖 +Mahasen Search&continuation model support&依赖 +Mahasen Search&FreePastry&依赖 +application&request&依赖 +node handle&request result&依赖 +application&node handle&依赖 +application&first result&依赖 +network&storage node and user&AGGREGATION +Mahasen&storage node and user&依赖 +Mahasen Client&HTTP method&依赖 +client&connection&依赖 +one&node&AGGREGATION +client&node&依赖 +client&network&依赖 +client&connection&依赖 +File content&entity&依赖 +File content&HTTP POST method&依赖 +end&file stream&依赖 +file&set&依赖 +file&predefined replication factor&依赖 +file&predefined chunk&依赖 +set&predefined chunk&AGGREGATION +reliability and performance&system&AGGREGATION +critical part&system&依赖 +critical part&reliability and performance&依赖 +placement&replica&AGGREGATION +current policy&replicated file&依赖 +current policy&set&依赖 +current policy&leaf node&依赖 +current policy&replicated file&依赖 +current policy&set&依赖 +current policy&leaf node&依赖 +current policy&Mahasen&AGGREGATION +distance&node&AGGREGATION +selection&node&AGGREGATION +client&status&依赖 +client&file transfer&依赖 +initial node&other node&依赖 +initial node&file&依赖 +status&file transfer&AGGREGATION +number©&AGGREGATION +replication factor&file&AGGREGATION +block&fault tolerance&依赖 +it&block&依赖 +it&fixed size&依赖 +block&network&依赖 +block&fixed size&AGGREGATION +system&node&依赖 +retrieval&file&AGGREGATION +user&default client&依赖 +user&external client&依赖 +default client&client&GENERALIZATION +client&upload , download , delete and search operation&依赖 +result&follow figure&依赖 +result&test&AGGREGATION +result&follow figure&依赖 +500MB size file&client&依赖 +500MB size file&upload test&依赖 +number&client increase&AGGREGATION +network congestion and background process&data replication&AGGREGATION +number&18 or 24&依赖 +number&node&AGGREGATION +number&18 or 24&依赖 +client&upload&依赖 +node&other node&依赖 +node&replica management task&依赖 +node&p2p ring&依赖 +node&replica management task&依赖 +node&other node&依赖 +node&p2p ring&依赖 +client&which&依赖 +increase&number&AGGREGATION +download file&Mahasen client&依赖 +number&client&AGGREGATION +Mahasen 
client&client&GENERALIZATION +single node setup&significant growth&依赖 +it&other node&依赖 +it&setup&依赖 +it&file transfer&依赖 +other available node&download time&依赖 +multiple node&system&依赖 +you&file part&依赖 +you&other available node&依赖 +Mahasen&Delete&依赖 +it&3 operation&依赖 +it&metada&依赖 +its&files& +single node&lowest possible time&依赖 +it&client&依赖 +it&aggregate result&依赖 +it&it&依赖 +we&same node&依赖 +we&same node&依赖 +search operation&operation&GENERALIZATION +we&search operation&依赖 +we&search operation&依赖 +its&architecture& +Mahasen&overall system scalable and fault tolerant&依赖 +Mahasen&metadata catalog&依赖 +Mahasen&system&依赖 +Mahasen&replication&依赖 +Mahasen&metada&依赖 +Mahasen&metada&依赖 +metadata catalog¢ralized architecture&依赖 +replica&both metadata object and property tree&AGGREGATION +Mahasen&both metadata object and property tree&依赖 +easy access&them&AGGREGATION +DHT&FreePastry&AGGREGATION +replica&actual file&依赖 +replica&metadata objects and property tree object&AGGREGATION +replica&much as keeping replica&依赖 +replica&actual file&依赖 +much as keeping replica&actual file&AGGREGATION +replica&much as keeping replica&依赖 +Mahasen&Mahasen operation&依赖 +Mahasen&many&依赖 +many&Mahasen operation&AGGREGATION +Mahasen&many&依赖 +Mahasen&correct functioning&依赖 +Mahasen&correct functioning&依赖 +correct functioning&many&AGGREGATION +Mahasen&correct functioning&依赖 +Mahasen&Mahasen operation&依赖 +Mahasen&Mahasen operation&依赖 +Mahasen&many&依赖 +important contribution&DHT&依赖 +top&DHT&AGGREGATION +important contribution&DHT&依赖 +important contribution&distributed indexing structure&依赖 +important contribution&distributed indexing structure&依赖 +important contribution&top&依赖 +important contribution&top&依赖 +important contribution&Mahasen&AGGREGATION +Mahasen&range based query&依赖 +we&earlier effort&依赖 +we&such index structure&依赖 +data structure&range based query&依赖 +we&search&依赖 +data structure&DHT&依赖 +Skip Tree Graph [ 18 ]&best candidate&依赖 +we&data structure&依赖 +one&best candidate&AGGREGATION +we&different property and datum structure&依赖 +number&entry&AGGREGATION +number&property&AGGREGATION +we&less complex solution&依赖 +we&expensive&依赖 +term&resource&AGGREGATION +it&available raw metada&依赖 +it&search operation&依赖 +large number&node&AGGREGATION +Mahasen&DHT and TreeMap&依赖 +Mahasen&combined data structure&依赖 +Mahasen&metadata object&依赖 +it&node&依赖 +network&them&依赖 +it&network&依赖 +availability&metadata object&AGGREGATION +replication factor&them&AGGREGATION +we&which&依赖 +Current Mahasen design&several limitation&依赖 +we&future work&依赖 +mahasen store&one Mahasen node&依赖 +mahasen store&property index&依赖 +memory&node&AGGREGATION +mahasen store&property index&依赖 +mahasen store&one Mahasen node&依赖 +NoSQL storage&similar assumption&依赖 +NoSQL storage&similar assumption&依赖 +We&maximum size&依赖 +We&part&依赖 +maximum size&part&AGGREGATION +one potential solution&data set&依赖 +one potential solution&size&依赖 +challenge&Mahasen&依赖 +size&data set&AGGREGATION +group&open source registry&AGGREGATION +Mahasen project&scalable storage solution&依赖 +Mahasen project&project&GENERALIZATION +user&group&依赖 +user&node&依赖 +node&group&AGGREGATION +node ( registry&storage overlay&依赖 +top&Pastry DHT algorithm&AGGREGATION +node ( registry&PAST&依赖 +property-based search&data item&AGGREGATION +Mahasen&distributed indexing structure&依赖 +Mahasen&top&依赖 +Mahasen&DHT&依赖 +user&Web Service API&依赖 +batch processing&file uploading task&AGGREGATION +SIGMOD ’00 Proceedings&2000 ACM SIGMOD international conference 2000 ) 2&AGGREGATION +2000 ACM SIGMOD international 
conference 2000 ) 2&datum&AGGREGATION +Management&datum&AGGREGATION +Amazon&Store& +Home and storage resource broker ( srb ® )&storage resource broker ( srb ® )&AGGREGATION +germany ( 2001 ) 16&germany ( 2001 ) 16&依赖 +germany ( 2001 ) 16&HotOS VIII&依赖 +germany ( 2001 ) 16&Schoss Elmau&依赖
diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker-simEnts.txt deleted file mode 100644 index 67416259f0fde4ffff53c54c973b52b030fce5e7..0000000000000000000000000000000000000000 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker-simEnts.txt +++ /dev/null @@ -1,141 +0,0 @@
-HAL Id: hal-01513774 -https://hal.inria.fr/hal-01513774 -Submitted on 25 Apr 2017
-HAL is a multi-disciplinary open access archive for the deposit and dissemination of scientific research documents, whether they are published or not. The documents may come from teaching and research institutions in France or abroad, or from public or private research centers.
-The multidisciplinary open archive HAL is intended for the deposit and dissemination of research-level scientific documents, whether published or not, originating from French or foreign teaching and research institutions, and from public or private laboratories.
-Distributed under a Creative Commons Attribution 4.0 International License
-Mahasen: Distributed Storage Resource Broker
-K. Perera, T. Kishanthan, H. Perera, D. Madola, Malaka Walpola, Srinath Perera
-To cite this version:
-K. Perera, T. Kishanthan, H. Perera, D. Madola, Malaka Walpola, et al.: Mahasen: Distributed Storage Resource Broker. 10th International Conference on Network and Parallel Computing (NPC), Sep 2013, Guiyang, China. pp. 380-392, 10.1007/978-3-642-40820-5_32. hal-01513774
-Mahasen: Distributed Storage Resource Broker
-K.D.A.K.S. Perera1, T. Kishanthan1, H.A.S. Perera1, D.T.H.V. Madola1, Malaka Walpola1, Srinath Perera2
-1 Computer Science and Engineering Department, University Of Moratuwa, Sri Lanka. {shelanrc, kshanth2101, ashansa.perera, hirunimadola, malaka.uom}@gmail.com
-2 WSO2 Lanka, No 59, Flower Road, Colombo 07, Sri Lanka -srinath@wso2.com
-Abstract. Modern day systems are facing an avalanche of data, and they are being forced to handle more and more data intensive use cases. These data come in many forms and shapes: sensors (RFID, Near Field Communication, weather sensors), transaction logs, the Web, social networks etc. As an example, weather sensors across the world generate a large amount of data throughout the year. 
Handling these and similar data require scalable, efficient, reliable and very large storages with support for efficient metadata based searching. This paper present Mahasen, a highly scalable storage for high volume data intensive applications built on top of a peer-to-peer layer. In addition to scalable storage, Mahasen also supports efficient searching, built on top of the Distributed Hash table (DHT) -1 Introduction -Currently United States collects weather data from many sources like Doppler readers deployed across the country, aircrafts, mobile towers and Balloons etc. These sensors keep generating a sizable amount of data. Processing them efficiently as needed is pushing our understanding about large-scale data processing to its limits. -Among many challenges data poses, a prominent one is storing the data and indexing them so that scientist and researchers can come and ask for specific type of data collected at a given time and in a given region. For example, a scientist may want to search for all Automated Weather data items collected in Bloomington area in June 15 between 8am-12pm. -Although we have presented meteorology as an example, there are many similar use cases. For instance, Sky server [1] is one of the best examples that illustrate the use case of large data generation. This project expects to collect 40 terabytes of data in five years. In its data collection, the photometric catalog is expected to contain about 500 distinct attributes for each of one hundred million galaxies, one hundred million stars, and one million quasars. Similarly many sciences, analytic processing organizations, data mining use cases etc., would want to store large amount of data and process them later in a selective manner. These systems often store data as files -and there have been several efforts to build large scale Metadata catalogs [2][3] and storage solutions[4][5] to support storing and searching those data items. One such example is AMGA metadata catalog [6] which was an effort to build replication and distribution mechanism for metadata catalogs. -As we discuss in the related work section, most of the metadata catalog implementations use centralized architectures and therefore have limited scalability unlike Mahasen. For example, Nirvana Storage [7] has a centralized metadata catalog which only supports scalability through vendor’s mechanism such as Oracle Real Application clusters. XML Metadata Concept catalog (XMC Cat) [8] is another centralized metadata catalog which stores hierarchical rich metadata. This paper presents Mahasen, a scalable metadata catalog and storage server built on top of a P2P technology. Further, it is built by distributing an open source centralized Data registry (WSO2 Registry). -Mahasen (Distributed Storage Resource Broker) is a Data Grid Management System (DGMS) that can manage a large volume of distributed data. It targets high volume data intensive applications. The architecture of Mahasen has been designed to present a single global logical namespace across all the stored data, and it maintains a metadata structure which can be used to search files based on its’ attributes. It is a network of storage servers that plays the dual purpose of a metadata catalog and a storage server. Mahasen will solve the huge data storage problem and fault tolerance in data intensive computing through aggregating low cost hardware while having both metadata and actual resources distributed without single point of failure. 
Metadata management will ensure the capability of searching files based on attributes of the stored resources. Mahasen has a metadata catalog, which is highly distributed and well scalable. The metadata layer ensures fault tolerance by keeping replicas of metadata. -The rest of the paper is organized as follows. The next section will discuss the related work in Metadata catalogs and Storage servers while comparing and contrasting them with Mahasen. The following section will discuss Mahasen architecture. The next section will present the performance evaluation of Mahasen. Finally the discussion section discusses limitations, other potential solutions and directions. -2 Related Work -2.1 Nirvana Storage -Nirvana SRB [7] is a middleware system that federates large heterogeneous data resources distributed across a network. The ability to access, manage, search and organize data across the entire SRB Federation is provided via a Global Namespace. MCAT is the centralized metadata repository which maintains two types of records – system- and user-metadata. Scalability of MCAT is achieved using database vendor’s mechanisms [9], hence limited by Relational DB scalability Limits. -Storage/Replication. The stored resources are divided as Physical resources, Logical resources and Cluster resources. Replication of resources across multiple servers ensures the availability and recoverability of resources during failovers. -Retrieve. Data stream routing is handled by SRB and TCP/IP, making the data transfer process transparent to the users.. -Search. Searching is done based on metadata attributes which are extracted and managed by the SRB. -Add/Update. Data can be added in two ways: Registration and Ingestion. Registration does not transfer any data but only creates a pointer to the data in MCAT. Ingestion is similar to registration but also transfers the data to an SRB storage resource. -Delete. If a file shadow object is used as a data object to ingest a file resource to SRB then file will be removed from MCAT but not from the physical location. -2.2 Apache OODT -OODT[10] is a middleware system for metadata that provides transparent access to the resources. It facilitates functionalities such as store, retrieve, search and analyze distributed data, objects and databases jointly. OODT provides a product service and profile service which manage data and metadata respectively. -Storage/Replication. OODT stores data product in a file-based storage in a distributed manner. They classify storage into three categories: on-line, near-line or off-line storage. -Retrieve. When OODT receives a request for retrieving a file, it issues a profile query to a product server that helps in resolving resources that could provide data. The response will include the target product server address in the form of a URI. The OODT issues a product query based on the profile query results to get the data, and it will actually retrieve data from the product server in a MIME-compliant format. -Search. OODT uses the profile server and the product server for searching the metadata and retrieve the products, and it has multiple of each type of server. OODT is based on client server architecture and it promotes REST-style architectural pattern for search and retrieve data. The profile or a subset of profile is returned for retrieval. -Add/Update. OODT provide data management including manage files and folders with the implementation of javax.sql.datasource interface. -Delete. 
The file management component of a Catalog and Archive Service support the delete of resource files and metadata through the implementation of javax.sql.datasource interface. -2.3 WSO2 Governance Registry -WSO2 Governance Registry [11] is a repository that allows users to store resources in a tree-structured manner, just like with a file system. However, unlike a file system, users may annotate resources using their custom properties, and also WSO2 Registry has built in metadata management features like tagging, associating resources. -However, WSO2 registry is backed by a Relational Database system, and it uses database features to store data, metadata, to manage them, and to search. Hence it has a centralized architecture. Mahasen extends that architecture to a distributed architecture. -Replication. There is no inbuilt mechanism to do the replication of resources in WSO2 registry. -Search. The WSO2 registry provides two types of searches. One is searching for a resource with their name, metadata etc., and it is implemented using underline relational database system. The second one is searching the content of resources, and implemented using Lucene [12]. The second search is only applicable to resources with textual content. -Add/Update. Adding of resources to registry can be done in two ways. First one is adding via the web interface provided by the registry. When adding a new resource, it is also possible to add additional metadata such as tags, properties of name value pairs, which later will be useful to search for that resource. The other way to add resources is by writing your own way by extending the registry API and exposing it as a web service. -The major limitation with registry, when storing resources, is the amount of memory available. Since it uses the java heap memory to buffer the resources before storing them, large files cannot be stored as the available memory is only limited to few hundred of megabytes. -2.4 Hadoop Distributed File System -Apache Hadoop Distributed File System is (HDFS)[13] is a file system designed to run on commodity hardware. HDFS has a master slave architecture that consists of a single NameNode as master and number of DataNodes. The NameNode is responsible of regulating access to files by client and managing the namespace of the file system. Generally DataNodes are deployed one per node in the cluster, and is responsible of managing storage attached to that node. -Storage / Replication. Hadoop supports hierarchical file organization where user can create directories and store files. It splits the file in to chunks with the default size of 64MB and stores them as sequence of blocks, and those blocks are stored in underlying file system of DataNodes. Those blocks are replicated for fault tolerance and the block size and the replication factor of data are configurable. -Retrieve. Applications that run on HDFS need streaming access to their data sets. Data nodes will be responsible for the read requests that issued from a user to retrieve data from the system. -Search. Hadoop Distributed File System does not provide a comprehensive search for users or applications, and it just fulfill the requirement of a distributed file system by supporting to locate the physical location of the file using the system specific metadata. -Add/Update. Writing to HDFS should be done by creating a new file and writing data to it. Hadoop addresses a single writer multiple readers’ model. Once the data is written and file is closed, one cannot remove or alter data. 
Data can be added to the file by reopening the file and appending new data. -Delete. When a file is deleted by a user or from an application, the particular resource is not immediately removed from HDFS. The resource will be renamed and copied in to /trash directory giving the possibility to restore as long as it remains in the trash. -Mahasen’s main differentiation from above systems comes from its scalability. It can scale significantly than Nirvana Storage that depends on relational databases to scale the system, since the Mahasen metadata layer is natively distributed using a DHT.WSO2 Registry provides the clustering as the scalability option, but it is not optimized for large file transfers and storing as it uses an ATOM based resource transfers. Furthermore, Mahasen provides users a comprehensive metadata model for managing the distributed resources they stored with user-defined metadata, unlike the HDFS, which only focuses on creating a Distributed file system. Further Mahasen's metadata layer is natively distributed and fault tolerant while HDFS has a single name node which can make fault tolerant only with an active passive failover configuration. -3 High Level Architecture -3.1 Mahasen High Level Architecture -As shown by Figure 1, Mahasen consists of several storage nodes which are connected as peers to a logical ring via FreePastry. Each node consists of a registry to store -metadata and a file system to store physical file parts. Once connected to the ring each node contributes to the metadata space as well as file storage capacity, scaling the system dynamically with new node additions. Nodes use underline DHT (FreePastry) routing protocol to communicate efficiently with each other. -Fig. 1. Mahasen High Level Architecture -Mahasen uses a WSO2 registry and the file system in each node and DHT based architecture is used to connect the nodes to a one unit. -Mahasen has a distributed metadata layer that stores data about the distributed files in Mahasen peer to peer network. The metadata catalog is used to broker the stored resources in the network and to assist the user to locate the files in Mahasen distributed environment abstracting the metadata management from the user. -Mahasen stores two main types of metadata, which are system-defined metadata and user-defined (descriptive) metadata. System defined metadata is mainly used for server side resource handling. File name, file size, stored node IPs of file are examples of the system-defined metadata. User defined metadata is used to provide users the searching capability on those metadata. User can add tags and properties (name, value pairs) to the files that are uploaded. -Fig. 2. Metadata Object Structure of Mahasen -When a file is uploaded connecting to a Mahasen node the file will be temporarily saved in that node. Then the node will act as the master node and split the file into pre-defined sized chunks and the split parts are stored in a selected set of the neighborhood nodes of master node through parallel transfer. Then the metadata object created by master node will be stored with replicas using PAST storage implementation of Free pastry. We have rewritten PAST node’s persistent storage such that the data will be stored in the WSO registry in that node. -After storing the metadata, the nodes that received file parts act as worker nodes and replicate their file parts in parallel according to the replicate request issued by the master node. 
Each worker node will update the metadata object with stored locations of the file parts which were replicated after replicating their file parts using the capability of concurrent access to metadata objects, and Mahasen handles them using the locking system provided by the lock manager of DHT. -User can request to download a file from any Mahasen node and the node will first generate the resource ID for the requested and retrieve the metadata object. Then it extracts the locations of Mahasen nodes that contain the file parts from the metadata object and retrieve those parts to the local machine. The parts will be merged to create the original file after retrieving all the parts and the file will be streamed to the user. -Deletion can be performed with a single command across a heterogeneous storage system. When a delete request for a file is issued, by following the same method of retrieving the file, Mahasen finds nodes that store parts of the file and deletes them. Finally the metadata object will also be deleted with replicas -When user needs to update the user-defined metadata, the node that receives the update request retrieves the metadata object for the file from the DHT, updates it, and stores it back in the DHT. -. Using this model, Mahasen has built a complete decentralized metadata system that handles metadata management in a highly scalable and efficient manner. -Mahasen keeps replicas of both actual files and metadata objects. The main purpose of keeping replicas is for fault tolerance and failover recovery. We ensure the high availability of metadata while ensuring the scalability using free pastry’s underlying DHT. -3.2 Mahasen Search -When the amount of data in the system grows, the complexity of the search increases. Mahasen builds a distributed data structure using the underlying DHT, which can improve the performance of different search options that Mahasen supports. -The resources in Mahasen are associated with metadata and for each tag or property in system, we maintain an index pointing to all resources which have that tag or property. This is implemented as a TreeMap [16] and the property trees are stored in the DHT which handles replicas of it. -Fig. 3. A Property Tree Stored in Mahasen Memory Storage -When a user sends a search request, Mahasen extracts the requested search and initiate the execution of relevant search method. Then the resource IDs of the files which match with the given input are retrieved from the relevant property tree. Extracting the relevant resource IDs are done as follow. -Users can send search requests to any Mahasen node, and when a node receives a search request, Mahasen takes the property name given by the client and generates the property tree ID for that property. If the current node has the index for the property, it receives matching resource IDs for that property and sends them to the client. If not, the node acts as a master node and gets the node handles of the nodes which are having the specific property tree and routs Mahasen search messages with the required parameters to the node handles. Then those node handles will get the relevant resource IDs from the property trees in their memory storage and send back to the master node. -The property values in the property tree are sorted, so that if the search is a range based search, we can simply take the sub map between the initial and final property values and retrieve the set of resource IDs mapped to each of the node in the sub tree. 
Since these resource IDs represents the files having the given property values, Mahasen can look up for the metadata objects with those resource IDs and extract the file names to present to for the user. The operation of extracting the file names for the resource IDs has a high cost than extracting the matching resource IDs for the given search query. -Complete Data Structure built for Mahasen can support property based search, range based search, tag based search and Boolean operations for the properties such as AND operation and OR operation. The advanced search provided by Mahasen is capable of providing the search based on set of different properties and tags. -Mahasen Search utilizes the continuation model support by FreePastry in results retrieving and transferring. Therefore when a search request is issued, the application sends requests to look up node handles, which contain the particular TreeMap object to request results. Then the application will collect the first result incoming and resume action from the previous execution point. -3.3 File Handling -File Transfer. Mahasen is a network of storage nodes and users will be given a client which is the Mahasen Client to access and transfer files to the network. The Mahasen Client that is built using the Apache HttpClient [17] uses HTTP methods for transferring files to the network. First the client initiates a connection with one of the node in the network. An authenticated client is capable of uploading downloading, deleting, updating or searching for the files in the network. The File content will be added as an entity to the HTTP POST method and streamed to the target address. The receiving end will read the file stream and write it to the repository. -Replica Management. To achieve fault tolerance and failover recovery, the file will be split into a set of predefined chunks and each part will be replicated and stored in different nodes according to predefined replication factor. The placement of replicas is a critical part which affects the reliability and performance of the system. The purpose of having a policy for placement of replicas is for data reliability, availability, and network bandwidth utilization. The current policy of Mahasen is to store the replicated files in leaf nodes set to the initial node. The selection of nodes in the leaf set will be calculated using cost evaluation function which focus on the distance of the node. -After successfully transferring the file to the initial node, the client will be notified about the status of the file transfer and initial node will then replicate and transfer the file to other nodes. The number of copies kept for a file is called the replication factor of that file and will be decided by the Mahasen system. -File Splitting and Parallel transfer. Mahasen storage network is designed to store large files reliably across distributed nodes. When storing the file it will be split into blocks of fixed size and these blocks will be replicated across the network for fault tolerance. The transferring of replicated file blocks will be done in parallel to other nodes in order to utilize the bandwidth and to save time. -When focusing on the retrieval of a file by using the metadata object the system will then select a node which is closest to the reader node and download the blocks to the client. Downloading of file blocks will also be done in parallel and then the blocks will be merged to create the complete file. 
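To make the search design above concrete, here is a minimal, illustrative Java sketch of the per-property index described in Section 3.2: a sorted java.util.TreeMap from property value to the resource IDs that carry that value, where a range query is answered by taking the sub map between the initial and final values. The class and method names are hypothetical and are not taken from the Mahasen or OODT sources.

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeMap;
import java.util.UUID;

// Illustrative property index: one sorted map per property, as in Section 3.2.
public class PropertyIndexSketch {
    private final TreeMap<String, Set<UUID>> valueToResourceIds = new TreeMap<>();

    // Register a resource under a property value (e.g. an observation timestamp).
    public void add(String propertyValue, UUID resourceId) {
        valueToResourceIds.computeIfAbsent(propertyValue, v -> new HashSet<>()).add(resourceId);
    }

    // Exact-match lookup used for plain property searches.
    public Set<UUID> lookup(String propertyValue) {
        return valueToResourceIds.getOrDefault(propertyValue, Collections.emptySet());
    }

    // Range-based search: because the keys are sorted, taking the sub map between
    // the initial and final values yields the matching slice directly.
    public Set<UUID> rangeLookup(String from, String to) {
        Set<UUID> result = new HashSet<>();
        for (Set<UUID> ids : valueToResourceIds.subMap(from, true, to, true).values()) {
            result.addAll(ids);
        }
        return result;
    }

    public static void main(String[] args) {
        PropertyIndexSketch index = new PropertyIndexSketch();
        UUID a = UUID.randomUUID();
        UUID b = UUID.randomUUID();
        index.add("2011-06-15T08:00", a);
        index.add("2011-06-15T11:30", b);
        // All resources whose property value falls between 08:00 and 12:00.
        System.out.println(index.rangeLookup("2011-06-15T08:00", "2011-06-15T12:00"));
    }
}

Because TreeMap keeps its keys sorted, subMap returns the matching slice without scanning the whole index, which is what keeps the range-based search cheap relative to scanning the raw metadata; in Mahasen the returned resource IDs are then resolved to metadata objects and file names as described above.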
-3.4 Mahasen API -Mahasen provides a complete API to perform CRUD operations and search. Users can develop external clients apart from the default client Mahasen provides and integrate with existing systems to perform resource management and search operations. -3 Performance Analysis -The Mahasen System Scalability was tested by running a system with M nodes and N parallel clients. Here the value for M was 1, 6, 12, 18, 24 and N was 1, 5, 10, 15, 20. Each client carried out upload, download, delete and search operations for 10 times and the average was taken. The system configuration that was used in this test are, Two machines with Intel(R) Xeon(R) CPU E5-2403 1.80GHz 4 Core machines having 24GB RAM and One machine with Intel(R) Xeon(R) CPU E5-2470 2.30GHz 8 Core machines having 63GB RAM. Following Figures (from 4 to 7) depicts the results of this test. In the upload test, 500MB size files were used by each client. . -Fig. 4. Upload test results -In the results it is observed that when the number of client increases, the upload time is also increasing. We believe that this is due to the network congestion and background processes of data replication across nodes. When the number of nodes increased to 18 or 24, a reduction in upload time were observed. This was an expected behaviour, because the node which client selects to upload, distributes replica management task for other nodes in the p2p ring. -Fig. 5. Download test results -When download files using Mahasen client, it is observed that with the increase of number of client, the single node setup has a significant growth in the download time. In the performance test, a single node was chosen to send the client request while it coordinates the file transfer from other nodes in the setup. Therefore when there are multiple nodes in the system you can download file parts from other available nodes, which reduces the download time. -Fig. 6. Delete test results -When Mahasen performs a Delete on a resource, it involves 3 operations such as deleting metadata, deleting entries from search index, and deleting the physical file. When more nodes are in the system, each node can participate in deleting its own files in parallel, making the system more scalable and efficient. -Fig 7. Search test results -Search results illustrate that Mahasen can perform well even with more nodes added to the system. Usually single node should have the lowest possible time as it does not have to search across the p2p ring. But with multiple nodes, it has to aggregate results and present it to the client. This can be observed from the figure that, when more clients are in the system, results tend to converge into a lower value due to caching as we requested search operation through the same node. -3 Discussion and future work -Mahasen provides a highly scalable metadata structure with its peer-to-peer architecture in the metadata catalog. Unlike the existing metadata catalogs that use centralized architecture, Mahasen distributes metadata across the nodes in the system with the replication making the overall system scalable and fault tolerant. -Mahasen keeps replicas of both metadata objects and property trees as well. The DHT of FreePastry is used to store these objects in the system which provides easy access of them. Keeping replicas of metadata objects and property tree objects do not cost as much as keeping replicas of actual files which are very large in size compared to metadata and property tree objects. 
By having these objects with replicas in the system, Mahasen has been able to ensure the correct functioning of many of the Mahasen operations even in the conditions like node failures. -An important contribution of Mahasen is developing a distributed indexing structure on top of the DHT for searching data products using different properties associated with data products. Since Mahasen needed to support range based queries, we evaluated earlier effort to build such index structures. Skip Tree Graph [18] was one of the best candidates we selected for search assisting data structure, which can efficiently support range based queries over a DHT. Since we had different properties and data structure had to grow in two dimensions, one in number of properties and the other one in number of entries for one property we were forced to create different DHTs for different properties. Therefore we needed to evaluate a much less complex -solution since maintaining different DHTs could have been very expensive in terms of resources. -When the system scales up with the large number of nodes, it will be more costly to issue a search operation on the available raw metadata stored. Therefore Mahasen developed a combined data structure with DHT and TreeMap as explained earlier. -When a Mahasen node fails, and it is detected by the existing nodes in the network, Mahasen replicates all the metadata objects and the property tree objects which were in the failed node to the existing Mahasen node reading them from other replicas. Mahasen helps in preserving the availability of metadata objects and property tree objects by maintaining the replication factor of them a constant. -Current Mahasen design has several limitations, which we plan to handle as future works. Currently Mahasen stores each property indexes in one Mahasen node and assumes that it will fit within the memory of that node. This may not be major concern for simple cases, and even NoSQL storages like Cassandra makes similar assumptions. Dividing the property tree into parts and storing them in different nodes when it is larger than a given size can solve this problem. We can predefine the maximum size of a part that will be residing in one node. -Another challenge is that search based multiple properties where at least one is a common property would force Mahasen to join large data sets, and one potential solution is to negotiate the size of data sets before start the data merging. -To summarize, Mahasen project builds a scalable storage solution by making a group of existing open source registries work as a one unit. It provides a one logical global namespace, and users may talk to any node of the group and perform any operations. -Mahasen connects nodes (registries) using PAST, a storage overlay implemented on top of Pastry DHT algorithm. Furthermore, Mahasen builds a distributed indexing structure on top of DHT to support property-based search of data items. -A user can benefit from the Web Service API provided and effectively utilize for batch processing of file uploading task through a custom client or basic client provided by Mahasen. -References -1. Alexander, S., Szalay, Peter, Z., Kunszt, Ani Thakar, Jim Gray, Don Slutz, and Robert, J., Brunner.: Designing and Mining Multi-Terabyte Astronomy Archives.: The Sloan Digital Sky Survey. In: SIGMOD ’00 Proceedings of the 2000 ACM SIGMOD international conference on Management of data (2000) -2. Chaitanya Baru, Reagan Moore, Arcot Rajasekar, Michael Wan.:The SDSC Storage Resource Broker (1998) -3. 
Moore, R.W.: Managing Large Distributed Data Sets using the Storage Resource Broker (2010)
-4. DeCandia, G., Hastorun, D., Jampani, M.: Dynamo: Amazon's Highly Available Key-value Store (2010)
-5. Ghemawat, S., Gobioff, H., Leung, S.-T.: The Google File System.
-6. Santos, N., Koblitz, B.: Distributed Metadata with the AMGA Metadata Catalog.
-7. Nirvana Storage - Home of the Storage Resource Broker (SRB®), http://www.nirvanastorage.com/index.php?module=htmlpages&func=display&pid=1 (2011)
-8. XML Metadata Concept Catalog (XMC Cat), Data to Insight Center, Indiana University Pervasive Technology Institute, http://d2i.indiana.edu/xmccat
-9. Nirvana Performance, http://www.nirvanastorage.com/index.php?module=htmlpages&func=display&pid=54
-10. Apache™ OODT, http://oodt.apache.org/ (2011)
-11. WSO2 Governance Registry - lean.enterprise.middleware - open source SOA | WSO2, http://wso2.com/products/governance-registry/ (2011)
-12. Apache Lucene - Overview, http://lucene.apache.org/java/docs/index.html
-13. HDFS Architecture Guide, http://hadoop.apache.org/docs/r1.0.4/hdfs_design.html (2011)
-14. Pastry - A scalable, decentralized, self-organizing and fault-tolerant substrate for peer-to-peer applications, http://www.freepastry.org/
-15. Druschel, P., Rowstron, A.: PAST: A large-scale, persistent peer-to-peer storage utility. In: HotOS VIII, Schloss Elmau, Germany (2001)
-16. TreeMap (Java 2 Platform SE 5.0), http://download.oracle.com/javase/1.5.0/docs/api/java/util/TreeMap.html (2011)
-17. HttpClient - HttpComponents HttpClient Overview, http://hc.apache.org/httpcomponents-client-ga/ (2011)
-18. González Beltrán, A., Sage, P., Milligan, P.: Skip Tree Graph: a Distributed and Balanced Search Tree for Peer-to-Peer Networks. \ No newline at end of file
diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker.txt.xml.xls b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..cbe7e28c30547f3f0bd91bf26445ef0577f6563d Binary files /dev/null and b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Mahasen Distributed Storage Resource Broker.txt.xml.xls differ
diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide-relation.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide-relation.txt index d54f7ddc463251b78178bfc3b81e55ec2ccf5216..505897ac09dc5a94a30e0abb2aca2aa4713809e7 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide-relation.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide-relation.txt @@ -1,299 +1,204 @@
-OODT Filemgr User Guide
-Created by Thomas Bennett, last modified by Nadeeshan Gimhana on May 17, 2019
-The File Manager -An Overview of What is Installed -Configuring and Running the File Manager -Whats going to happen? -Now for some configuration -What have we configured? -How metadata is collected? -A brief overview of filemgr-client and query-tool -Command: filemgr-client -Command: query-tool -A Typical User Scenario -A few more tools -Tips and Tricks for FileManager -The File Manager -This self guided tutorial is intended for first time users. 
- -The fact that you've found this page, I assume that you are seriously thinking of using the OODT File Manager but are eager to get something up and running. It hopefully also means that you've checked out the code and built a cas-filemgr install target (e.g. a cas-filemgr-${version}-dist.tar.gz file). - -This tutorial is by no means a complete overview of all the File Managers functionality. However, it's an attempt to get you started using the basic tools. Like learning to drive a car, the most difficult part is getting it started and on the road! - -The following topics are covered on this page: - -An Overview of What is Installed -Configuring and Running the File Manager -A Typical User Scenario - ingesting and querying -An Overview of What is Installed -Assumption - you have built or have access to a cas-filemgr install target. This also means that you've correctly configured maven and java for your system. - -Here are the commands to install the cas-filemgr target from a tarfile. You will need to fit in the "..." with the appropriate content. - -$ mkdir -p /usr/local/oodt/ -$ tar xzvf .../filemgr/target/cas-filemgr-${version}-dist.tar.gz -C /usr/local/oodt/ -$ cd /usr/local/oodt/ -$ ln -s cas-filemgr-${version}/ cas-filemgr -The decompressed tar file creates a directory structure that looks as follows: - -. -├── bin -│ ├── filemgr -│ ├── filemgr-client -│ └── query-tool -├── etc -│ ├── filemgr.properties -│ └── mime-types.xml -├── lib -│ └── *.jar -├── logs -└── policy -| ├── cmd-line-actions.xml -| ├── cmd-line-options.xml -| ├── core -| │ ├── elements.xml -| │ ├── product-type-element-map.xml -| │ └── product-types.xml -| | -| ├── trace -| | ├── elements.xml -| | ├── product-type-element-map.xml -| | └── product-types.xml -| | -| ├── geo -| | ├── elements.xml -| | ├── product-type-element-map.xml -| | └── product-types.xml -| | -| (additional policy sub directories) -└── run -Please note, if you are using version 0.3 of OODT or earlier, the policy directory will look like this (with no sub directories): - -└── policy - ├── elements.xml - ├── product-type-element-map.xml - └── product-types.xml -Here is a brief description of each directory that you see listed: - -bin : contains shell convenience scripts for launching java classes -etc : contains configuration files, i.e. *.property and *.xml files -lib : contains java resources, i.e *.jar files -logs : contains file manager log files. -policy : contains product specifications, i.e *.xml specification files -The bin directory contains a number of executables: - -filemgr : file manager (startup/shutdown) script -filemgr-client : file manager client interface script -query-tool : catalog query tool -Configuring and Running the File Manager -You're now ready to run the file manager! - -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./filemgr --help -Usage: ./filemgr {start|stop|status} -$ ./filemgr start -Whats going to happen? -The filemgr should be up and running, however, some WARNING messages may appear, complaining about configuration. - -If you get a java.net.BindException exception, make sure that no other service is running on port 9000. This is the port for an RPC interface that will be used for transferring data files into a repository. - -There's also a new file in the /usr/local/oodt/run directory. The file contains the filemgr process id. This is typical for *nix service house keeping. It is done to try and avoid running multiple filemgr services. - -There's also a new log file /usr/local/oodt/cas-filemgr/logs/cas_filemgr0.log. 
Tailing this file can often alert to you problems. - -$ tail -f /usr/local/oodt/cas-filemgr/logs/cas_filemgr0.log - -Now for some configuration -To do anything useful with your filemgr, you will need to specify some configurations in the /usr/local/oodt/cas-filemgr/etc/filemgr.properties file. - -Here is a basic modification to the filemgr.properties file: - -filemgr.properties -org.apache.oodt.cas.filemgr.catalog.lucene.idxPath=/usr/local/oodt/cas-filemgr/catalog -org.apache.oodt.cas.filemgr.repositorymgr.dirs=file:///usr/local/oodt/cas-filemgr/policy/core -org.apache.oodt.cas.filemgr.validation.dirs=file:///usr/local/oodt/cas-filemgr/policy/core -org.apache.oodt.cas.filemgr.mime.type.repository=/usr/local/oodt/cas-filemgr/etc/mime-types.xml -You will also need to specify a repository path in the product-types.xml file. Make sure that this path exists before you change the repository path xml element. - -product-types.xml - -Restart your filemgr so that it re-reads the filemgr.properties and product-types.xml: -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./filemgr restart - -What have we configured? -A place to store your catalog, i.e. the database of metadata. -A place to store your ingested files, i.e. the repository. -The location of your policy directory for product specifications. -Your mime-types configuration file for file recognition. -How metadata is collected? -Now for some brief notes about how metadata is collected. The filemgr captures metadata in two different ways - from client side metadata extraction and server side metadata extraction. - -Client side metadata is passed to the filemgr via an xml formatted metadata file. E.g. a file called blah.txt can have a metadata file called blah.txt.met. This met file can be created in many ways, even by hand! And thats exactly what we're going to do. - -Server side metadata is generated by using java classes and the extractors that will be used are configured in the product-types.xml file in the chosen policy directory. For this example configuration, you should have /usr/local/oodt/cas-filemgr/policy/oodt as the policy directory, unless you're running version 0.3 or earlier of OODT, in which case you should have /usr/local/oodt/cas-filemgr/policy as the policy directory. - -Now would be a good time to have a quick look at the product-types.xml file. It contains some critical information about what is going to happen when we ingest our first file into the repository. - -Specified in the product-types.xml file, there is a default product type called GenericFile. This is the product type that we are going to use for the first file for ingestion. - -For the GenericFile type find the key. It's specifying some metadata. We're defining the product type! - -For the GenericFile type find the key. It's specifying some extractors to use for server side metadata extraction, namely: CoreMetExtractor, MimeTypeExtractor, FinalFileLocationExtractor. For more details about metadata and extractors see Metadata Extractors. - -If you're feeling curious, check out the other xml files in the /usr/local/oodt/cas-filemgr/policy subdirectories to get a better feel for how we define product types and elements. For a discussion of best practices w.r.t File Manager Policy, the reader is referred to Everything you want to know about File Manager Policy - -A brief overview of filemgr-client and query-tool -These commands are found in /usr/local/oodt/cas-filemgr/bin. 
- -Command: filemgr-client -In order to trigger a file ingestion we're going to use the filemgr-client. This is by no means the most automated way to ingest data into an repository, however it's a really easy and intuitive way to trigger a file ingestion. The filemgr-client is a wrapper script, making it easier to invoke a java executable from the command line. - -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./filemgr-client --help -filemgr-client --url --operation [ [params]] -operations: ---addProductType --typeName --typeDesc - --repository --versionClass ---ingestProduct --productName --productStructure - --productTypeName --metadataFile - [--clientTransfer --dataTransfer ] - --refs ... ---hasProduct --productName ---getProductTypeByName --productTypeName ---getNumProducts --productTypeName ---getFirstPage --productTypeName ---getNextPage --productTypeName --currentPageNum ---getPrevPage --productTypeName --currentPageNum ---getLastPage --productTypeName ---getCurrentTransfer ---getCurrentTransfers ---getProductPctTransferred --productId --productTypeName ---getFilePctTransferred --origRef -As you can see there's a number of different ways this command can be executed. - -The first command line argument is --url. This is the location of the filemgr xml-rpc data transfer interface. Looking at the filemgr logs (specifically cas_filemgr0.log), we see an INFO statement telling us that local data transfer is enable on http://localhost:9000. This is the url that we need to specify. - -The second command line argument is --operation and there are 13 different types of operations that are possible! For now we are going to use the --ingestProduct operation. From the help command you can see that the --ingestProduct operation requires some further command line arguments to be specified. - -However, before we take a look at the --operation --ingestProduct, I would first like to shed a bit more light on the query-tool command. - -Command: query-tool -This is a very useful wrapper script to query the content of your repository. - -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./query-tool -Must specify a query and filemgr url! -Usage: QueryTool [options] -options: ---url - Lucene like query options: - --lucene - -query - SQL like query options: - --sql - -query - -sortBy - -outputFormat -We see that we need to set some command line arguments to get anything useful out of the query tool. Try the next command: - -$ ./query-tool --url http://localhost:9000 --sql -query 'SELECT * FROM GenericFile' - -This should throw an exception, telling us it failed to perform a query. This is because there is no catalog yet (and therefore the GenericFile information does not exist). In fact if you have a look there is no catalog directory: - -$ ls /usr/local/oodt/cas-filemgr/catalog -ls: /usr/local/oodt/cas-filemgr/catalog: No such file or directory - -A Typical User Scenario -Time to ingest a very, very simple file. If you have not already, restart your filemgr so that it re-reads the filemgr.properties: -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./filemgr restart - -For this simple ingestion we are not going to include any client side metadata, all the metadata collection will happen on the server side using the specified *Extractor extractors in the product-types.xml file. - -Create a text file and its metadata file for ingestion: -$ echo 'hello' > /tmp/blah.txt -$ touch /tmp/blah.txt.met - -Add the following xml to the /tmp/blah.txt.met file: - -blah.txt.met - - -Lets ingest the file! 
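Note: the XML snippet the guide refers to for blah.txt.met appears to have been stripped from this copy. Since this walkthrough deliberately supplies no client-side metadata, the met file is just an empty CAS metadata wrapper; assuming the standard cas:metadata element used by the File Manager, it would look roughly like:

<cas:metadata xmlns:cas="http://oodt.jpl.nasa.gov/1.0/cas">
</cas:metadata>

Any client-side metadata would be added inside this element as key/value entries; in this example the server-side extractors supply everything.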
For --operation --ingestProduct we need to specify the following arguments: - ---productName : The name you want for your ingested product ---productStructure : Flat file or directory (i.e. hierarchical). Yes... we can ingest whole directories as one product ---productTypeName : A product type (as per product-types.xml) ---metadataFile : The client side metadata file ---refs : The product location -There's also an optional argument --clientTransfer, however, we're going to leave this and use the default local transfer. -[--clientTransfer --dataTransfer ] - -Here is the complete command: -$ ./filemgr-client --url http://localhost:9000 --operation --ingestProduct --productName blah.txt --productStructure Flat --productTypeName GenericFile --metadataFile file:///tmp/blah.txt.met --refs file:///tmp/blah.txt - -The output should look like: -Sep 16, 2011 2:09:42 PM org.apache.oodt.cas.filemgr.system.XmlRpcFileManagerClient -... -... -ingestProduct: Result: c2fbf4b9-e05c-11e0-9022-77a707615e7f - -You've just archived your first file (眨眼). - -To complete the process, lets see if we can retrieve the metadata. Run the query command again: -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./query-tool --url http://localhost:9000 --sql -query 'SELECT * FROM GenericFile' - -The output should look like: -Sep 16, 2011 2:21:54 PM org.apache.oodt.cas.filemgr.system.XmlRpcFileManager complexQuery -INFO: Query returned 1 results -/var/archive/data/blah.txt,GenericFile,blah.txt,blah.txt,2011-09-16T14:09:43.405+02:00,c2fbf4b9-e05c-11e0-9022-77a707615e7f,Flat,text/plain,text,plain - -Check to see if the file has appeared in the archive: -$ ls /var/archive/data/blah.txt/ -blah.txt - -Query commands do not depend on the underlying catalog implementation. The --sql and --lucene instead describe the filemgr query syntax. - -At the time of writing this tutorial, composing queries using query-tool is not entirely straight forward, but entirely usable. Formatting of these queries is critical, small deviations from the syntax can result in the query return an unexpected value or throwing an exception. - -Some things to note about SQL queries: - -Use double quotes ("") for when specifying the SQL syntax. The single quote ('') is used for string values in a WHERE clause, e.g WHERE Filename='blah.txt' -Count the number of -- before each command line option. Some are -- and others are -. -The order of the return values for a search is not guaranteed unless you specify the \outputFormat option. -Here is a somewhat verbose example that uses all the SQL-like syntax that I am currently aware of (apologies for all the line breaks). - -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./query-tool --url http://localhost:9000 --sql \ --query "SELECT CAS.ProductReceivedTime,CAS.ProductName,CAS.ProductId,ProductType,\ -ProductStructure,Filename,FileLocation,MimeType \ -FROM GenericFile WHERE Filename='blah.txt'" -sortBy 'CAS.ProductReceivedTime' \ --outputFormat '$CAS.ProductReceivedTime,$CAS.ProductName,$CAS.ProductId,$ProductType,\ -$ProductStructure,$Filename,$FileLocation,$MimeType' -The output should look like: -2011-10-07T10:59:12.031+02:00,blah.txt,a00616c6-f0c2-11e0-baf4-65c684787732, -GenericFile,Flat,blah.txt,/var/kat/archive/data/blah.txt,text/plain - -Now you can also check out some of the other 12 --operation possibilities for filemgr-client. 
For instance: - -$ ./filemgr-client --url http://localhost:9000 --operation --hasProduct --productName blah.txt - -Or: - -$ ./filemgr-client --url http://localhost:9000 --operation --getFirstPage --productTypeName GenericFile - -A few more tools -Cameron Goodale has written some useful command line tools aliases that are worth mentioning before we continue. See the following two web pages: https://issues.apache.org/jira/browse/OODT-306 -BASH and TCSH shell tools for File Manager - -Tips and Tricks for FileManager -Q: My Lucene Index Catalog is running slow now that I have over 100,000 products cataloged. How can I get the speed back? - -A: Run this command: -java -Djava.endorsed.dirs= org.apache.oodt.cas.filemgr.tools.OptimizeLuceneCatalog --catalogPath \ No newline at end of file +最终由 nadeeshan gimhana修改于 五月 17 , 2019转至元数据起始&what&AGGREGATION +we&configuration&依赖 +brief overview&a typical user scenario a few more tool tips and tricks&依赖 +brief overview&a typical user scenario a few more tool tips and tricks&依赖 +brief overview&a typical user scenario a few more tool tips and tricks&依赖 +brief overview&FileManager The File Manager&依赖 +brief overview&FileManager The File Manager&依赖 +brief overview&FileManager The File Manager&依赖 +brief overview&filemgr-client and query-tool command&AGGREGATION +brief overview&FileManager The File Manager&依赖 +brief overview&a typical user scenario a few more tool tips and tricks&依赖 +brief overview&a typical user scenario a few more tool tips and tricks&依赖 +brief overview&FileManager The File Manager&依赖 +self&first time user&依赖 +you&page&依赖 +tutorial&mean&依赖 +tutorial&all file managers functionality&依赖 +complete overview&all file managers functionality&AGGREGATION +you&basic tool&依赖 +topic&page&依赖 +your&system& +/ filemgr/target/cas-filemgr&directory structure&依赖 +/ filemgr/target/cas-filemgr&directory structure&依赖 +/ filemgr/target/cas-filemgr&directory structure&依赖 +/ filemgr/target/cas-filemgr&directory structure&依赖 +/ filemgr/target/cas-filemgr&directory structure&依赖 +/ filemgr/target/cas-filemgr&directory structure&依赖 +you&version 0.3&依赖 +version 0.3&OODT&AGGREGATION +policy directory&directory&GENERALIZATION +you&OODT&依赖 +policy directory&this (&依赖 +policy directory&sub directory&依赖 +└ ─ ─ policy ├ ─ ─ elements.xml ├ ─ ─ product-type-element-map.xml └ ─ ─ product-types.xml&directory&依赖 +you&that&依赖 +brief description&directory&AGGREGATION +bin&configuration file&依赖 +bin&i.e. *&依赖 +bin&configuration file&依赖 +bin&i.e. 
*&依赖 +jar file log&contains file manager log file&依赖 +bin directory&number&依赖 +bin directory&directory&GENERALIZATION +number&executable&AGGREGATION +bin directory&executable&依赖 +You&file manager&依赖 +catalog query tool&File Manager&依赖 +/&filemgr&GENERALIZATION +you&java.net.BindException exception&依赖 +other service&port 9000&依赖 +file&filemgr process id&依赖 +you&configuration&依赖 +your&filemgr& +You&repository path&依赖 +You&product-types.xml file&依赖 +/&/&GENERALIZATION +path&repository path xml element&依赖 +your&xml& +it&filemgr.properties and product-type&依赖 +we&what&依赖 +your&catalog& +database&metada&AGGREGATION +place&metada&依赖 +place&metada&依赖 +place&database&依赖 +place&database&依赖 +your&files& +place&repository&依赖 +place&repository&依赖 +location&policy directory&AGGREGATION +your&directory& +your mime-types configuration file&file recognition&依赖 +your mime-types configuration file&file recognition&依赖 +your mime-types configuration file&file recognition&依赖 +your mime-types configuration file&your mime-types configuration file&依赖 +your mime-types configuration file&your mime-types configuration file&依赖 +your mime-types configuration file&your mime-types configuration file&依赖 +Your&mime-types& +your mime-types configuration file&file recognition&依赖 +your mime-types configuration file&your mime-types configuration file&依赖 +metada&brief note&依赖 +metada&how&依赖 +filemgr&metada&依赖 +filemgr&client side metada extraction and server side metada extraction&依赖 +filemgr&two different way&依赖 +metadata file&file&GENERALIZATION +Client side metada&filemgr&依赖 +Client side metada&xml formatted metadata file&依赖 +a file call blah.txt&metadata file&依赖 +extractor&chosen policy directory&依赖 +extractor&product-types.xml file&依赖 +you&/ usr/local/oodt / cas-filemgr/policy&依赖 +you&example configuration&依赖 +you&have&依赖 +you&policy directory&依赖 +you&policy directory&依赖 +you&/ usr/local/oodt / cas-filemgr/policy&依赖 +version 0.3 or earlier&OODT&AGGREGATION +you&policy directory&依赖 +example configuration&configuration&GENERALIZATION +you&version 0.3 or earlier&依赖 +you&cas-filemgr/policy/oodt&依赖 +product-types.xml file&file&GENERALIZATION +we&repository&依赖 +our&file& +It&critical information&依赖 +we&first file&依赖 +It&going&依赖 +we&that&依赖 +GenericFile type& key&依赖 +It&metada&依赖 +We&product type&依赖 +GenericFile type& key&依赖 +It&extractor&依赖 +more detail&Metadata Extractors&依赖 +more detail&Metadata Extractors&依赖 +we&product types and element&依赖 +reader&discussion&依赖 +discussion&best practice&AGGREGATION +reader&Everything&依赖 +you&File Manager Policy&依赖 +reader&w.r.t File Manager Policy&依赖 +reader&best practice&依赖 +command&/ usr/local/oodt / cas-filemgr/bin&依赖 +brief overview&filemgr-client and query-tool&AGGREGATION +we&filemgr-client&依赖 +number&different way&AGGREGATION +location&filemgr xml-rpc data transfer interface&AGGREGATION +INFO statement&us&依赖 +we&that&依赖 +13 different type&operation&AGGREGATION +we&-- ingestProduct operation&依赖 +you&further command line argument&依赖 +I&light&依赖 +we&ingestProduct&依赖 +your&repository& +content&repository&AGGREGATION +/ query-tool Must&a query and filemgr url&依赖 +anything&query tool&AGGREGATION +SELECT&GenericFile& +query&SELECT& +it&us&依赖 +you&look&依赖 +a typical user scenario time&simple file&实现 +a typical user scenario time&a typical user scenario time&依赖 +a typical user scenario time&a typical user scenario time&实现 +a typical user scenario time&simple file&依赖 +it&$ cd / usr/local/oodt / cas-filemgr/bin $&依赖 +it&filemgr.properties&依赖 +we&client side metada&依赖 +metadata 
collection&product-types.xml file&依赖 +metadata collection&specified * Extractor extractor&依赖 +metadata collection&server side&依赖 +its&file& +tmp/blah&text file&依赖 +tmp/blah&text file&依赖 +tmp/blah&ingestion&依赖 +text file&file&GENERALIZATION +hello&tmp/blah& +tmp/blah&ingestion&依赖 +blah.txt.met let&file&依赖 +we&arguments :&依赖 +you&ingested product&依赖 +your&product& +we&product location&依赖 +we&whole directory&依赖 +we&one product&依赖 +/&tmp/blah&GENERALIZATION +output&sep 16 , 2011 2:09:42 pm org.apache.oodt.cas.filemgr.system.xmlrpcfilemanagerclient &依赖 +You&your first file (&依赖 +your&file& +we&metada&依赖 +Query&/&依赖 +output&sep 16 , 2011 2:21:54 pm org.apache.oodt.cas.filemgr.system.xmlrpcfilemanager complexquery info&依赖 +output&sep 16 , 2011 2:21:54 pm org.apache.oodt.cas.filemgr.system.xmlrpcfilemanager complexquery info&依赖 +file&archive&依赖 +Query command&catalog implementation&依赖 +-- sql and -- lucene&filemgr query syntax&依赖 +query&tutorial&依赖 +query&time&依赖 +query&tutorial&依赖 +query&time&依赖 +small deviation&unexpected value&依赖 +small deviation&exception&依赖 +Formatting&query&AGGREGATION +small deviation&query return&依赖 +small deviation&unexpected value&依赖 +small deviation&exception&依赖 +small deviation&query return&依赖 +single quote ('')&Count&依赖 +single quote ('')&WHERE clause&依赖 +single quote ('')&command line option&依赖 +=&Count& +single quote ('')&string value&依赖 +order&return value&AGGREGATION +you&\ outputFormat option&依赖 +verbose example&aware&依赖 +verbose example&SQL-like syntax&依赖 +CAS.ProductReceivedTime&\& +=&blah.txt& +sortBy&\& +WHERE&blah.txt& +a few more tool cameron goodale&some useful command line tool alias&依赖 +My&Catalog& diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide-simEnts.txt deleted file mode 100644 index d54f7ddc463251b78178bfc3b81e55ec2ccf5216..0000000000000000000000000000000000000000 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide-simEnts.txt +++ /dev/null @@ -1,299 +0,0 @@ -OODT Filemgr User Guide -页面… CAS User Guides -跳到banner的尾部 -回到标题开始 -转至元数据结尾 -由 Thomas Bennett创建, 最终由 Nadeeshan Gimhana修改于 五月 17, 2019转至元数据起始 -The File Manager -An Overview of What is Installed -Configuring and Running the File Manager -Whats going to happen? -Now for some configuration -What have we configured? -How metadata is collected? -A brief overview of filemgr-client and query-tool -Command: filemgr-client -Command: query-tool -A Typical User Scenario -A few more tools -Tips and Tricks for FileManager -The File Manager -This self guided tutorial is intended for first time users. - -The fact that you've found this page, I assume that you are seriously thinking of using the OODT File Manager but are eager to get something up and running. It hopefully also means that you've checked out the code and built a cas-filemgr install target (e.g. a cas-filemgr-${version}-dist.tar.gz file). - -This tutorial is by no means a complete overview of all the File Managers functionality. However, it's an attempt to get you started using the basic tools. Like learning to drive a car, the most difficult part is getting it started and on the road! - -The following topics are covered on this page: - -An Overview of What is Installed -Configuring and Running the File Manager -A Typical User Scenario - ingesting and querying -An Overview of What is Installed -Assumption - you have built or have access to a cas-filemgr install target. 
This also means that you've correctly configured maven and java for your system. - -Here are the commands to install the cas-filemgr target from a tarfile. You will need to fit in the "..." with the appropriate content. - -$ mkdir -p /usr/local/oodt/ -$ tar xzvf .../filemgr/target/cas-filemgr-${version}-dist.tar.gz -C /usr/local/oodt/ -$ cd /usr/local/oodt/ -$ ln -s cas-filemgr-${version}/ cas-filemgr -The decompressed tar file creates a directory structure that looks as follows: - -. -├── bin -│ ├── filemgr -│ ├── filemgr-client -│ └── query-tool -├── etc -│ ├── filemgr.properties -│ └── mime-types.xml -├── lib -│ └── *.jar -├── logs -└── policy -| ├── cmd-line-actions.xml -| ├── cmd-line-options.xml -| ├── core -| │ ├── elements.xml -| │ ├── product-type-element-map.xml -| │ └── product-types.xml -| | -| ├── trace -| | ├── elements.xml -| | ├── product-type-element-map.xml -| | └── product-types.xml -| | -| ├── geo -| | ├── elements.xml -| | ├── product-type-element-map.xml -| | └── product-types.xml -| | -| (additional policy sub directories) -└── run -Please note, if you are using version 0.3 of OODT or earlier, the policy directory will look like this (with no sub directories): - -└── policy - ├── elements.xml - ├── product-type-element-map.xml - └── product-types.xml -Here is a brief description of each directory that you see listed: - -bin : contains shell convenience scripts for launching java classes -etc : contains configuration files, i.e. *.property and *.xml files -lib : contains java resources, i.e *.jar files -logs : contains file manager log files. -policy : contains product specifications, i.e *.xml specification files -The bin directory contains a number of executables: - -filemgr : file manager (startup/shutdown) script -filemgr-client : file manager client interface script -query-tool : catalog query tool -Configuring and Running the File Manager -You're now ready to run the file manager! - -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./filemgr --help -Usage: ./filemgr {start|stop|status} -$ ./filemgr start -Whats going to happen? -The filemgr should be up and running, however, some WARNING messages may appear, complaining about configuration. - -If you get a java.net.BindException exception, make sure that no other service is running on port 9000. This is the port for an RPC interface that will be used for transferring data files into a repository. - -There's also a new file in the /usr/local/oodt/run directory. The file contains the filemgr process id. This is typical for *nix service house keeping. It is done to try and avoid running multiple filemgr services. - -There's also a new log file /usr/local/oodt/cas-filemgr/logs/cas_filemgr0.log. Tailing this file can often alert to you problems. - -$ tail -f /usr/local/oodt/cas-filemgr/logs/cas_filemgr0.log - -Now for some configuration -To do anything useful with your filemgr, you will need to specify some configurations in the /usr/local/oodt/cas-filemgr/etc/filemgr.properties file. - -Here is a basic modification to the filemgr.properties file: - -filemgr.properties -org.apache.oodt.cas.filemgr.catalog.lucene.idxPath=/usr/local/oodt/cas-filemgr/catalog -org.apache.oodt.cas.filemgr.repositorymgr.dirs=file:///usr/local/oodt/cas-filemgr/policy/core -org.apache.oodt.cas.filemgr.validation.dirs=file:///usr/local/oodt/cas-filemgr/policy/core -org.apache.oodt.cas.filemgr.mime.type.repository=/usr/local/oodt/cas-filemgr/etc/mime-types.xml -You will also need to specify a repository path in the product-types.xml file. 
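The product-types.xml excerpt itself is missing from this text version, so for orientation only, here is a hedged sketch of the kind of element this refers to (element and attribute names are assumed from a stock product-types.xml, and the path is a placeholder you should replace with a directory that exists on your system):

<type id="urn:oodt:GenericFile" name="GenericFile">
    <repository path="file:///var/archive/data"/>
    <!-- assumed sketch: the rest of the type definition (versioner, metadata, extractors) is omitted here -->
</type>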
Make sure that this path exists before you change the repository path xml element. - -product-types.xml - -Restart your filemgr so that it re-reads the filemgr.properties and product-types.xml: -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./filemgr restart - -What have we configured? -A place to store your catalog, i.e. the database of metadata. -A place to store your ingested files, i.e. the repository. -The location of your policy directory for product specifications. -Your mime-types configuration file for file recognition. -How metadata is collected? -Now for some brief notes about how metadata is collected. The filemgr captures metadata in two different ways - from client side metadata extraction and server side metadata extraction. - -Client side metadata is passed to the filemgr via an xml formatted metadata file. E.g. a file called blah.txt can have a metadata file called blah.txt.met. This met file can be created in many ways, even by hand! And thats exactly what we're going to do. - -Server side metadata is generated by using java classes and the extractors that will be used are configured in the product-types.xml file in the chosen policy directory. For this example configuration, you should have /usr/local/oodt/cas-filemgr/policy/oodt as the policy directory, unless you're running version 0.3 or earlier of OODT, in which case you should have /usr/local/oodt/cas-filemgr/policy as the policy directory. - -Now would be a good time to have a quick look at the product-types.xml file. It contains some critical information about what is going to happen when we ingest our first file into the repository. - -Specified in the product-types.xml file, there is a default product type called GenericFile. This is the product type that we are going to use for the first file for ingestion. - -For the GenericFile type find the key. It's specifying some metadata. We're defining the product type! - -For the GenericFile type find the key. It's specifying some extractors to use for server side metadata extraction, namely: CoreMetExtractor, MimeTypeExtractor, FinalFileLocationExtractor. For more details about metadata and extractors see Metadata Extractors. - -If you're feeling curious, check out the other xml files in the /usr/local/oodt/cas-filemgr/policy subdirectories to get a better feel for how we define product types and elements. For a discussion of best practices w.r.t File Manager Policy, the reader is referred to Everything you want to know about File Manager Policy - -A brief overview of filemgr-client and query-tool -These commands are found in /usr/local/oodt/cas-filemgr/bin. - -Command: filemgr-client -In order to trigger a file ingestion we're going to use the filemgr-client. This is by no means the most automated way to ingest data into an repository, however it's a really easy and intuitive way to trigger a file ingestion. The filemgr-client is a wrapper script, making it easier to invoke a java executable from the command line. - -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./filemgr-client --help -filemgr-client --url --operation [ [params]] -operations: ---addProductType --typeName --typeDesc - --repository --versionClass ---ingestProduct --productName --productStructure - --productTypeName --metadataFile - [--clientTransfer --dataTransfer ] - --refs ... 
---hasProduct --productName ---getProductTypeByName --productTypeName ---getNumProducts --productTypeName ---getFirstPage --productTypeName ---getNextPage --productTypeName --currentPageNum ---getPrevPage --productTypeName --currentPageNum ---getLastPage --productTypeName ---getCurrentTransfer ---getCurrentTransfers ---getProductPctTransferred --productId --productTypeName ---getFilePctTransferred --origRef -As you can see there's a number of different ways this command can be executed. - -The first command line argument is --url. This is the location of the filemgr xml-rpc data transfer interface. Looking at the filemgr logs (specifically cas_filemgr0.log), we see an INFO statement telling us that local data transfer is enable on http://localhost:9000. This is the url that we need to specify. - -The second command line argument is --operation and there are 13 different types of operations that are possible! For now we are going to use the --ingestProduct operation. From the help command you can see that the --ingestProduct operation requires some further command line arguments to be specified. - -However, before we take a look at the --operation --ingestProduct, I would first like to shed a bit more light on the query-tool command. - -Command: query-tool -This is a very useful wrapper script to query the content of your repository. - -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./query-tool -Must specify a query and filemgr url! -Usage: QueryTool [options] -options: ---url - Lucene like query options: - --lucene - -query - SQL like query options: - --sql - -query - -sortBy - -outputFormat -We see that we need to set some command line arguments to get anything useful out of the query tool. Try the next command: - -$ ./query-tool --url http://localhost:9000 --sql -query 'SELECT * FROM GenericFile' - -This should throw an exception, telling us it failed to perform a query. This is because there is no catalog yet (and therefore the GenericFile information does not exist). In fact if you have a look there is no catalog directory: - -$ ls /usr/local/oodt/cas-filemgr/catalog -ls: /usr/local/oodt/cas-filemgr/catalog: No such file or directory - -A Typical User Scenario -Time to ingest a very, very simple file. If you have not already, restart your filemgr so that it re-reads the filemgr.properties: -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./filemgr restart - -For this simple ingestion we are not going to include any client side metadata, all the metadata collection will happen on the server side using the specified *Extractor extractors in the product-types.xml file. - -Create a text file and its metadata file for ingestion: -$ echo 'hello' > /tmp/blah.txt -$ touch /tmp/blah.txt.met - -Add the following xml to the /tmp/blah.txt.met file: - -blah.txt.met - - -Lets ingest the file! For --operation --ingestProduct we need to specify the following arguments: - ---productName : The name you want for your ingested product ---productStructure : Flat file or directory (i.e. hierarchical). Yes... we can ingest whole directories as one product ---productTypeName : A product type (as per product-types.xml) ---metadataFile : The client side metadata file ---refs : The product location -There's also an optional argument --clientTransfer, however, we're going to leave this and use the default local transfer. 
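One more note before running the command: the XML for /tmp/blah.txt.met was stripped out of this text version above. A minimal sketch of the usual CAS metadata envelope (the namespace and element names are assumed from standard OODT met files, so verify against your version) looks like this; since this scenario relies on server-side extraction, the envelope can stay essentially empty:

<cas:metadata xmlns:cas="http://oodt.jpl.nasa.gov/1.0/cas">
   <!-- assumed sketch: client-side key/value pairs, if you want any, would go here -->
   <keyval>
      <key>Description</key>
      <val>A simple test file</val>
   </keyval>
</cas:metadata>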
-[--clientTransfer --dataTransfer ] - -Here is the complete command: -$ ./filemgr-client --url http://localhost:9000 --operation --ingestProduct --productName blah.txt --productStructure Flat --productTypeName GenericFile --metadataFile file:///tmp/blah.txt.met --refs file:///tmp/blah.txt - -The output should look like: -Sep 16, 2011 2:09:42 PM org.apache.oodt.cas.filemgr.system.XmlRpcFileManagerClient -... -... -ingestProduct: Result: c2fbf4b9-e05c-11e0-9022-77a707615e7f - -You've just archived your first file (眨眼). - -To complete the process, lets see if we can retrieve the metadata. Run the query command again: -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./query-tool --url http://localhost:9000 --sql -query 'SELECT * FROM GenericFile' - -The output should look like: -Sep 16, 2011 2:21:54 PM org.apache.oodt.cas.filemgr.system.XmlRpcFileManager complexQuery -INFO: Query returned 1 results -/var/archive/data/blah.txt,GenericFile,blah.txt,blah.txt,2011-09-16T14:09:43.405+02:00,c2fbf4b9-e05c-11e0-9022-77a707615e7f,Flat,text/plain,text,plain - -Check to see if the file has appeared in the archive: -$ ls /var/archive/data/blah.txt/ -blah.txt - -Query commands do not depend on the underlying catalog implementation. The --sql and --lucene instead describe the filemgr query syntax. - -At the time of writing this tutorial, composing queries using query-tool is not entirely straight forward, but entirely usable. Formatting of these queries is critical, small deviations from the syntax can result in the query return an unexpected value or throwing an exception. - -Some things to note about SQL queries: - -Use double quotes ("") for when specifying the SQL syntax. The single quote ('') is used for string values in a WHERE clause, e.g WHERE Filename='blah.txt' -Count the number of -- before each command line option. Some are -- and others are -. -The order of the return values for a search is not guaranteed unless you specify the \outputFormat option. -Here is a somewhat verbose example that uses all the SQL-like syntax that I am currently aware of (apologies for all the line breaks). - -$ cd /usr/local/oodt/cas-filemgr/bin -$ ./query-tool --url http://localhost:9000 --sql \ --query "SELECT CAS.ProductReceivedTime,CAS.ProductName,CAS.ProductId,ProductType,\ -ProductStructure,Filename,FileLocation,MimeType \ -FROM GenericFile WHERE Filename='blah.txt'" -sortBy 'CAS.ProductReceivedTime' \ --outputFormat '$CAS.ProductReceivedTime,$CAS.ProductName,$CAS.ProductId,$ProductType,\ -$ProductStructure,$Filename,$FileLocation,$MimeType' -The output should look like: -2011-10-07T10:59:12.031+02:00,blah.txt,a00616c6-f0c2-11e0-baf4-65c684787732, -GenericFile,Flat,blah.txt,/var/kat/archive/data/blah.txt,text/plain - -Now you can also check out some of the other 12 --operation possibilities for filemgr-client. For instance: - -$ ./filemgr-client --url http://localhost:9000 --operation --hasProduct --productName blah.txt - -Or: - -$ ./filemgr-client --url http://localhost:9000 --operation --getFirstPage --productTypeName GenericFile - -A few more tools -Cameron Goodale has written some useful command line tools aliases that are worth mentioning before we continue. See the following two web pages: https://issues.apache.org/jira/browse/OODT-306 -BASH and TCSH shell tools for File Manager - -Tips and Tricks for FileManager -Q: My Lucene Index Catalog is running slow now that I have over 100,000 products cataloged. How can I get the speed back? 
- -A: Run this command: -java -Djava.endorsed.dirs= org.apache.oodt.cas.filemgr.tools.OptimizeLuceneCatalog --catalogPath \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide.txt.xml.xls b/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..b3e9d99d2ec738ef53f93efc322e0a20c13c1e42 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Apache OODT File Manager/OODT Filemgr User Guide.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action-relation.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action-relation.txt index 1c22f3309dfbbc720b09c3f5d59af9540e8c183b..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action-relation.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action-relation.txt @@ -1,26 +0,0 @@ -Catalog and Archive File Management Component 0.12 API -Packages -Package Description -org.apache.oodt.cas.filemgr.catalog -org.apache.oodt.cas.filemgr.catalog.solr -org.apache.oodt.cas.filemgr.cli.action -org.apache.oodt.cas.filemgr.datatransfer -org.apache.oodt.cas.filemgr.exceptions -org.apache.oodt.cas.filemgr.ingest -org.apache.oodt.cas.filemgr.metadata -org.apache.oodt.cas.filemgr.metadata.extractors -org.apache.oodt.cas.filemgr.metadata.extractors.examples -org.apache.oodt.cas.filemgr.repository -org.apache.oodt.cas.filemgr.structs -org.apache.oodt.cas.filemgr.structs.exceptions -org.apache.oodt.cas.filemgr.structs.query -org.apache.oodt.cas.filemgr.structs.query.conv -org.apache.oodt.cas.filemgr.structs.query.filter -org.apache.oodt.cas.filemgr.structs.type -org.apache.oodt.cas.filemgr.structs.type.examples -org.apache.oodt.cas.filemgr.system -org.apache.oodt.cas.filemgr.system.auth -org.apache.oodt.cas.filemgr.tools -org.apache.oodt.cas.filemgr.util -org.apache.oodt.cas.filemgr.validation -org.apache.oodt.cas.filemgr.versioning \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action-simEnts.txt deleted file mode 100644 index 1c22f3309dfbbc720b09c3f5d59af9540e8c183b..0000000000000000000000000000000000000000 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action-simEnts.txt +++ /dev/null @@ -1,26 +0,0 @@ -Catalog and Archive File Management Component 0.12 API -Packages -Package Description -org.apache.oodt.cas.filemgr.catalog -org.apache.oodt.cas.filemgr.catalog.solr -org.apache.oodt.cas.filemgr.cli.action -org.apache.oodt.cas.filemgr.datatransfer -org.apache.oodt.cas.filemgr.exceptions -org.apache.oodt.cas.filemgr.ingest -org.apache.oodt.cas.filemgr.metadata -org.apache.oodt.cas.filemgr.metadata.extractors -org.apache.oodt.cas.filemgr.metadata.extractors.examples -org.apache.oodt.cas.filemgr.repository -org.apache.oodt.cas.filemgr.structs -org.apache.oodt.cas.filemgr.structs.exceptions -org.apache.oodt.cas.filemgr.structs.query -org.apache.oodt.cas.filemgr.structs.query.conv -org.apache.oodt.cas.filemgr.structs.query.filter 
-org.apache.oodt.cas.filemgr.structs.type -org.apache.oodt.cas.filemgr.structs.type.examples -org.apache.oodt.cas.filemgr.system -org.apache.oodt.cas.filemgr.system.auth -org.apache.oodt.cas.filemgr.tools -org.apache.oodt.cas.filemgr.util -org.apache.oodt.cas.filemgr.validation -org.apache.oodt.cas.filemgr.versioning \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action.txt.xml.xls b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..f8af3037fb4043ef401be08ca2007838c77a1e74 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Apache OODT File Manager/Package org.apache.oodt.cas.filemgr.cli.action.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager-relation.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager-relation.txt index 600b501136a12f398d7b5d1d04cd146a9e6d4d6e..08f288bc70551a07ef06090422da9acb09958dbd 100644 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager-relation.txt +++ b/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager-relation.txt @@ -1 +1,770 @@ -react file manager All components included in this dashboard template has been developed to bring all the potential of HTML5 and Bootstrap plus a set of new features (JS and CSS) ideal for your next dashboard admin theme or admin web application project. 확장성을 보유할것 외부 프로젝트에서도 Description. Redux helps you write applications that behave consistently, run in different environments (client, server, and native), and are easy to test. It has a beautiful design, as you can see from the live previews and it contains a LOT of components and features. filemanager namespace exposes the FileManagerCommand class that could be extended to implement a custom File Manager command. Just display a list within 2 predifined tabs (folders). Create React App - TS docs; Next. The Edit screen with option to select one or more files is displayed. Add Start script to package. 9,676 4. Inbuilt Search textbox in FileManager: See Also. Module files are represented in the design manager in a multi-pane module editor. To include the File Manager component in application import the FileManagerComponent from ej2-react-filemanager package in App. 1 - 28 November 2020 ----- - Upgraded Bootstrap version to 4. Filemanager with React & Nodejs . prod. The download manager handles HTTP connections, monitors connectivity changes, reboots, and ensures each download completes successfully. I would like this shortcode to be dynamic, i. Grab the demo from Github if you haven't done this yet. Mobile applications definitely offer a greater value to businesses than their mobile website In this tutorials we will use a package named @react-native-community/checkbox to add checkboxes in react native. Express your opinions freely and help others including your future self You can customize Storybook's webpack setup by providing a webpackFinal field in . 4. 3. import React from 'react'; import 'devextreme/dist/css/dx. js, Express and TypeScript. Install the React components and choose a theme that suits your needs. target. 2 - Ability to translate Wireframes and PSD Designs into functional web apps using HTML5, React , Node. 
In this tutorial you will learn how to create a working file upload component with react from scratch using no dependencies other than react itself. 2/6. /. You can rearrange the order of your files by dragging them around to move the important files to the top of the list for faster access. Run the Drupal Page having React Nested modals aren’t supported, but if you really need them the underlying react-overlays can support them if you're willing. Download the corresponding App Center SDK for iOS frameworks provided as a zip file and unzip it. NET Core suite along with 100+ fully-featured UI components designed to speed up delivery & improve every aspect of target. You’ll see a plus symbol to the left of the file or folder. com and its affiliated web properties is provided "as is" without warranty of any kind. An electron based file manager. The file manager application is like the heart of a smartphone. Modal's "trap" focus in them, ensuring the keyboard navigation cycles through the modal, and not the rest of the page. Async uploading with AJAX, or encode files as base64 data and send along form post. Overview of Kendo UI FileManager; Sort in Kendo UI FileManager; Toolbar Commands in Kendo UI FileManager Express your opinions freely and help others including your future self I am a beginner in react. LibraryManager. Developed with the latest jQuery plugins. html and . Complete file and folder manager: Create, rename, move and delete a folder. It's very important for me your collaboration on my development tasks and time. Test your JavaScript, CSS, HTML or CoffeeScript online with JSFiddle code editor. Go to react_code\src and change the apiUrl inside config. js as per your current url of Drupal. rtl8761a_mp_chip_bt40_fw_asic_rom_patch_8192ee_new. Say “EDIT MODE”. 1. To enable profiling in production mode, modify Webpack configuration file (config/webpack. To download and start utilizing Syncfusion's Essential Studio for React components, see our pricing model. A simple file manager built with react. azurewebsites. 0. Vue. Updated laravel 7 to all full version and starter-kit; React Fixed. mp4 videos to the server. Learn to build modern web applications using Angular, React & Vue! File Upload Component with Vue. A file input (dropzone) management component for React. TIP: If you have never seen a dot file (a file starting with a dot) it might be odd at first because that file might not appear in your file manager, as it’s a hidden file. Unlike vanilla Bootstrap, autoFocus works in Modals because React handles the implementation Free download Filedash – File Manager Dashboard Nulled. Maybe later i can have a button to have the grid view File-Manager 개발정의서 들어가며 본 문서는 인수인계 목적이 아닌 개발이 완료된 제품에 대한 이해를 돕기위해 제작된 개발 정의서입니다. Say for instance that you want to open the file select dialogue for a user to select an file to upload. This sample demonstrates how to utilize the Amazon S3 file system provider to manage the files in File Manager component. Deploy Trillo File Manager from GCP Marketplace In this tutorials we will use a package named @react-native-community/checkbox to add checkboxes in react native. Choose a device definition, Nexus 5X is suggestable. Create a new project with React Native. React also allows us to create reusable UI components. Use Git or checkout with SVN using the web URL. bs4 File Manager. v6. js . Source code: https://bit. Learn more . log(event. Also, you might want to customize the look of the file input in the form to make it resonate with your overall app design. 
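To make that concrete, here is a minimal hedged sketch in plain React (no extra libraries; the component name, class name, and callback are made up for illustration) of a styled button that opens the file dialog through a hidden input and reads event.target.files[0], as in the onChange handler quoted in this text:

import React, { useRef } from 'react';

// A styled button that proxies clicks to a hidden <input type="file">.
function FileButton({ onFile }) {
  const inputRef = useRef(null);
  return (
    <>
      {/* the real input stays hidden; the button below triggers it */}
      <input
        ref={inputRef}
        type="file"
        style={{ display: 'none' }}
        onChange={event => onFile(event.target.files[0])}
      />
      <button className="upload-btn" onClick={() => inputRef.current.click()}>
        Choose file
      </button>
    </>
  );
}

export default FileButton;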
Chocolatey is trusted by businesses to manage software deployments. Conclusion Let’s work that out. Scheduler. svg'; // replace it with your path // Profile upload helper const HandleImageUpload = => { // we are referencing the file input const imageRef = useRef(); // Specify the default image const [defaultUserImage React Filemanager. Download Nulled Filedash – File Manager Dashboard. 1 - Added new Scrollable layout. x, Columns: 4+. FTP Access Upload files via FTP Need easier and faster way to upload and download. The default locale of the file manager is en (English). In our editor / file manager we should see a . It is distributed through NPM under the kendo-react-upload package. It was initially called Filer but was changed to Thunar due to a name clash. It can be used as a standalone app or as a middleware. I will try to make it clean and retro-compatible with the previous bridges/connectors. When it comes to both of these issues, React can help you provide a better user experience. Themes and Skinning JavaScript - jQuery, Angular, React, Vue React Data Grid. Build files will be created build. Select, Copy, Paste, and Delete. Python dictionary add, delete, update, exists keys with performance; java. react-dropzone is a React’s implementation of popular drag and drop library for file uploading. And in our opinion, the Webix library offers the best solution available on the market. Free bootstrap snippets, examples and resources built with html, css and js. Adding React File Manager for PDF Library In the previous section, we added the File Server Node API component from filemanager (by -OpusCapita) . So follow the below setups:- 1) Install the @react-native-community/checkbox package like below in your application 2) Link the module in your application 3) Import Free React Design System For Bootstrap 4 (reactstrap) 9,824 4. You can add spans to any grid element, fine-tune the table sizes, specify the columns’ auto width, and freeze one or more columns. Folder based file browser given a flat keyed list of objects, powered by React. 10. Client React connector for Google Drive API v2; Detailed documentation for each package is coming soon. You can fire up the project with dotnet run to see what the scaffold does for you. /assets/images/defaultUser. It is fully responsive, built with Bootstrap 4 Framework, HTML5, CSS3 and SCSS. The editor is the whole CodeSandbox application (file manager, code editor, dependency settings) and the preview is the result you see on the right. Extension for Visual Studio Code - Simple extensions for React, Redux and Graphql in JS/TS with ES7 syntax Another file format that uses the MD file extension is Moneydance Financial Data. import React, { useEffect, useRef, useState } from 'react'; // Specify camera icon to replace button text import camera from '. - Added Blog List, Blog Grid, Blog Details pages. Site Navigation and Layout. React Native This is an exact mirror of the React Native project, A lightweight and easy-to-use password manager Clonezilla. All the operating systems got a file manager to filter the required files. Create React App – How to Create and Deploy a React Application to Production. and we can drill down into various modules. 5. 0 To do so, right-click the libman. By default, Storybook's webpack configuration will allow you to: Import Images and other static files Semantic UI React provides React components while Semantic UI provides themes as CSS stylesheets. mov, . (Ex – Facebook, Twitter and Google. 
Then add the File Manager component as shown in below code example. Web. View . This is one of the admin tools that our customers manage their static files on shared host. Initially, the selectedFilestate is set to null The FileBrowser dialogs consist of a FileBrowser object, an object previewer/property manager and a file uploader tab. Let’s install Bootstrap and React. ” - [source] You can delete the files from My Media, My Documents or My Photos folders. Managing your React. Site Navigation and Layout. This can be done in one of two ways: Run bower install --save for each package (the --save flag adds the dependencies (name and version) to the bower. fThe file is called : "FirstReactApp. bs4 File Manager. A partition and disk Adds React debugging tools to the Chrome Developer Tools. This is a Sample React Plugin for Apache OODT File Manager. The FileBrowser dialogs consist of a FileBrowser object, an object previewer/property manager and a file uploader tab. Thunar is designed to start up faster and be more responsive than some other Linux file managers, such as Nautilus and Konqueror. rtl8761a_mp_chip_bt40_fw_asic_rom_patch_8812ae_new. This is an example file with default selections. new file manager windows 10 Executive Summary These course materials were originally designed for Google managers to help them transition from an individual contributor role to a manager role. jsx". dll and React. filebrowser provides a file managing interface within a specified directory and it can be used to upload, delete, preview, rename and edit your files. Files. json must be written in JSON. I will try to make it clean and retro-compatible with the previous bridges/connectors It's very important for me your collaboration on my development tasks and time. They are not part of the template and NOT included in the final purchase files. So in the above imports, the files would be CartTotal. In XCode, in the project navigator, right click Libraries Add Files to [your project's name] Go to node_modules react-native-file-manager and add the . LibraryManager. You can go for either an HTML5 drag & drop file uploader or use the traditional way. json file is saved. I want somebody who can redo what has been done and finish it. On the file manager for a domain you have a ‘public_html’ folder. dll (if using MVC 4) in your Web Application project Your first build always needs to be done using the build script ( dev-build. The FileBrowser provides the ability to browse directories and locate a file item. Downloading the file. cs ). ) Wrap long file names in the File Manager’s detail view Customize icons in the Folder Tree. com, lands you with the opportunity of working with a leading technology organization. v 2. You Install react-file-reader (A flexible ReactJS component for handling styled HTML file inputs. files[0]) } On saving, create-react-app will instantly refresh the browser. babelrc configuration file. Looking for the best react native app development companies? Here is the list of the top React native developers with reviews by ADA. Unfortunately it can be quite intimidating. if I add/remove files in File Manager, it will react dynamically on the front-side (so I don’t need to modify the shortcode or put a React is a popular open-source JavaScript library – many of you asked for an easier integration between Power BI and React web applications. Free bootstrap snippets, examples and resources tagged with file-manager, html, css and js. 
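As a rough sketch of how that client component is typically wired to the Node API, under the assumption that the imports and prop names below match the @opuscapita/react-filemanager and @opuscapita/react-filemanager-connector-node-v1 packages mentioned in this text (the apiRoot URL is a placeholder, so check the packages' own documentation):

import React from 'react';
import { FileManager, FileNavigator } from '@opuscapita/react-filemanager';
import connectorNodeV1 from '@opuscapita/react-filemanager-connector-node-v1';

// Point the connector at the File Server Node API from the previous section (placeholder URL).
const apiOptions = {
  ...connectorNodeV1.apiOptions,
  apiRoot: 'http://localhost:3020'
};

const fileManager = (
  <div style={{ height: '480px' }}>
    <FileManager>
      <FileNavigator
        id="pdf-library"
        api={connectorNodeV1.api}
        apiOptions={apiOptions}
        capabilities={connectorNodeV1.capabilities}
        listViewLayout={connectorNodeV1.listViewLayout}
        viewLayoutOptions={connectorNodeV1.viewLayoutOptions}
      />
    </FileManager>
  </div>
);

export default fileManager;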
Themes and Skinning JavaScript - jQuery, Angular, React, Vue React Data Grid. Please help me to move forward with a donation by paypal :) The file manager component is used to browse, manage, and organize the files and folders in a file system through a web application. js. js file to a . Build) to the project, which will trigger a restore as part of project build. json file and choose “Enable Restore on Build”. Any FOSS lover is warmly welcomed A lot of people name React components with a capital letter in the file, to distinguish them from regular JavaScript files. To delete one or more files, 1. When viewing a module locally, the files are contained within module-name. Input A file input management component for React. WP Media Folder v5. Upload React website to subdomain Open File Manager Create new folder inside “public_html” Upload whole content of “build” folder into this new created folder. The “React JS Developer” role at one. How can I create a custom command for the Kendo UI File Manager? Creating a Custom Command. Restore on demand Library Manager will restore client-side libraries whenever the libman. It allows the creation of multiple users and each user can have its own directory. In this tutorial we are going to create a task manager application from scratch with react. Once a file item is selected, it (or its properties) is loaded in the previewer. The DevExtreme JavaScript FileManager component allows you to display and manage files and directories for different file systems. To make the functions work as expected, I transpile these into CommonJS format in addition to transpiling React JSX files. 부디 도움이 되길 바랄 뿐입니다. It's a command-line utility connected with the corresponding online repository of packages and is capable of package installation, version management, and dependency management. Step 9: Configuring AVD Manager. This time with Trillo File Manager is an application for Dropbox-like functionality on the top of the GCS cloud storage bucket. However, you don’t want to use the standard file input HTML element, instead use a styled link or button to show the file window. 3 - Binding of UI elements to JavaScript object models. 2. React is one of the best choices for building modern web applications. js file. Default configuration. This is a Sample React Plugin for Apache OODT File Manager. React has a slim API, a robust and evolving ecosystem and a great community. If nothing happens, download Xcode and try again. com and its affiliated web properties is provided "as is" without warranty of any kind. After downloading the installation file of it, double click on it and proceed with the installation. The KendoReact Upload component is part of the KendoReact library of React UI components. common. Say “EDIT MODE”. json React Component by Creating. You can delete the files from My Media, My Documents or My Photos folders. The new React component supports both JavaScript and TypeScript and will help you embed your analytics in a React web application. Our file caching system will have two main parts. Tailor fman to your needs with its powerful plugin system. React can handle a single button, a few pieces of an interface, or an app's entire user interface. js. Requirements Creating a File Upload Component with React. react-native-azure-blob-storage-manager. npm install react-files --save Usage Basic I don't think there is one, but it's such a strange question, React is used on the web as a frontend library, while a file manager runs on your local computer. 
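For the transpilation step mentioned above (CommonJS output plus React JSX), a hedged sketch of a typical Babel setup would be a .babelrc along these lines; the preset names are the standard Babel packages, but your project may pin different options:

{
  "presets": [
    ["@babel/preset-env", { "modules": "commonjs" }],
    "@babel/preset-react"
  ]
}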
Then use the Axios library to send the file request to the Laravel server and saves the image in the server. ReactOS is a free and open-source operating system for x86/x64 personal computers intended to be binary-compatible with computer programs and device drivers made for Windows Server 2003. Component { render() { return ( ); } } export default App; File Manager can be initialized using the tag. npm start To create a new build inside dist directory. Disclaimer: The information provided on DevExpress. Download Manager is a system service which optimizes the handling of long-running downloads in the background. The first is a React component, which will wrap around RNFetchBlob’s functionality and respond to changes in the Redux store. This method of deleting corrupted files requires you to close "Windows Explorer" through "Task Manager". json. A list of its key features is given below. The MD file stores transactions, budgets, stock information, bank accounts, and other related data for the Moneydance finance software. The second is a set of actions and reducers on the Redux store which deal specifically with file caching. apiOptions, apiRoot: `http://opuscapita-filemanager-demo-master. Integrate TinyMCE editor in Laravel with a File Manager / Image Upload Jquery PHP July 27, 2019 2,006 views Create Sortable, drag and drop multi-level list with jquery like wordpress menu page All Webix widgets and applications function well in the Angular or React environment. Today we will create File Manager App UI with animation using React Native. Sweet Alert in dark layout; Design Files Removed. Installation. Free bootstrap snippets, examples and resources tagged with file-manager, html, css and js. . dll. When a TypeScript script gets compiled there is an option to generate a declaration file (with the extension . You have high end protection; It also has a file manager that is easy to access. With React, you can create reusable components that are independent of each other. Use the Download button in the toolbar. The following table represents the default texts and messages of the file manager in en culture. This project based course will introduce you to all of the modern toolchain of a React developer in 2020. For example, users can upload images, videos, etc on Facebook, Instagram. Source + Demo. Data List; React DataTable Component Vue based front-end for File Manager Aug 01, 2018 1 min read. To associate your repository with the react-electron topic, visit your repo's landing page and select "manage topics. . 6. Benefits of Hosting React. This will add the LibraryManager NuGet package (Microsoft. Add events to precisely control file/folder operations (folder creation, file uploading, moving, deleting, etc. 6. Go through the following steps for creating React project to download file from server using React. A full list of the compatible frameworks and integration examples you can find on this page . Files and folders in the file system can be sorted in either ascending or descending order simply by npm install --save @opuscapita/react-filemanager @opuscapita/react-filemanager-connector-node-v1. If nothing happens, download GitHub Desktop and try again. import ReactFileReader from 'react-file-reader'; class Because the other files & folders above (some truncated) are usually part of a default react-native init installation, our focus would be on the src folder:. tsx. 5, npm 6. Drag & Drop your files in folders: Drag & Drop and image to move it into a folder, where you can find other files. 
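Returning to the Axios upload flow described at the start of this passage, a minimal hedged sketch looks like the following; the /api/upload route, the form field name, and the Laravel backend behaviour are assumptions to adapt to your own app:

import axios from 'axios';

// Send the selected file to the backend as multipart/form-data.
function uploadImage(file) {
  const formData = new FormData();
  formData.append('image', file); // field name is an assumption; match your server-side validation
  return axios.post('/api/upload', formData, {
    headers: { 'Content-Type': 'multipart/form-data' }
  });
}

// usage sketch: uploadImage(event.target.files[0]).then(res => console.log(res.data));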
To configure the AVD Manager click on the respective icon in the menu bar. These first have been selected by most active users and ranking has been given based on the most popular votes. Accessible , tested with AT software like VoiceOver and JAWS, navigable by Keyboard . changing a . target. Install from NPM and include it in your own React build process (using Browserify, Webpack, etc). The project is about uploading a users products/services. The name npm (Node Package Manager) stems from when npm first was created as a package manager for Node. If multiple packages depend on a package - jQuery for example - Bower will download jQuery just once. React. View . Communicating react with asp. holyidiot updated Vuexy - Vuejs, React, HTML & Laravel Admin Dashboard Template with a new update entry: Update [6. 0] – 2020-11-28 Latest Update [6. ts) that functions as an interface to the components in the compiled JavaScript. Added 2021-01-09 file-manager,file-browser spofly Desktop app to find lyrics of currently playing song on spotify. You have two options for creating a file uploader. ReactOS will only be compatible with computers that are compatible with Windows 2003 or XP. ej2-react-filemanager. Basic usage. Any FOSS lover is warmly welcomed React Native ; Bootstrap file-manager examples. Bower provides hooks to facilitate using packages in your tools and workflows. jsx" Select Web => JSX File, and enter file name "FirstReactApp. The FileManager provides an inbuilt Search functionality, allowing you to find the specific file in the currently selected folder. React File Manager: A Lightweight & Customizable Component File upload and download. File Browser Front-end. Paper Kit ReactOS is a free, opensource reimplementation of windows Related: How to Copy and Paste Text, Files and Folders in Linux Terminal. Developer Express Inc disclaims all warranties, either express or implied, including the warranties of merchantability and fitness for a particular purpose. Removed complementary design files from the package [5. Uploading Files using HTML5 Uploader. TagSpaces features basic file management operations, so it can be used as simple file manager. To associate your repository with the react-electron topic, visit your repo's landing page and select "manage topics. dhtmlxGrid contains rich API functionality. This is an unparalleled array of features, design elements and reusable components Introduction to Asus file manager. To do so, right-click the libman. /assets/images/camera. Try All UI Components for Free All UI components for React are part of the dhtmlxSuite library. Changing the Webpack config. Add multiple URL to pocket at a time. Create better React apps faster and add data visualizations with the world's fastest, virtualized, real-time React data grid and streaming financial and business charts. File uploading means a user from a client machine wants to upload files to the server. net core on remote server (httpdocs folder) does not work RSS 0 replies Last post 2 hours, 59 minutes ago by fiazahmed An electron based file manager. 10 – WordPress File Manager Using the default WordPress media manager also means that the plugin will be very compatible with all the other plugins you use. It’s not a visual file manager, but it gives a set of functions to easily handle media/files in your Laravel app. html’ file to run on start up. Once a file item is selected, it (or its properties) is loaded in the previewer. Read the full article at: http://bgwebagency. 
The presence of these handlers enables the buttons and/or the drag & drop responsiveness. (eg. However, newer versions of the program use . The FileManager uses file system providers to access file systems. fastlane/ This folder, as you might React doesn’t have opinions on how you put files into folders. files[0]) } On saving, create-react-app will instantly refresh the browser. No action needed. pdf file to use a . Firebase issue during npm install [5. The standard tool for this task is Babel. The create-react-app utility configures tools such as Babel and webpack for the client-side React application. Build) to the project, which will trigger a restore as part of project build. In this section, we are going to add the Client React component from OpusCapita for navigating the folders and listing the files in our PDF library. nyc_output and coverage folder containing our instrumentation detail. . In terms of frontend frameworks, this React admin dashboard is powered by Material-UI, which is the most popular material-based UI components framework available today. - Added File Manager Page. MONEYDANCE files instead. Blog Post. Page 16. lang. A file input (dropzone) management component for React. Videos: Manage member level settings & Videos created by Users. storybook/main. ) Channels: Manage member level settings & Channel created by Users. It enables the user to perform common file operations such as accessing, editing, uploading, downloading, and sorting files and folders. To upload a file with React and Laravel, create a React file component and backend in Laravel. 2. React Chart. The following sample is extracted from Android 4 ICS emulator image. Similarly, every smartphone has a file manager to view, edit, and create any text files, delete, sort, or rename, copy, and cut whenever required. All npm packages are defined in files called package. ). The official front-end framework for building experiences that fit seamlessly into Microsoft 365. Create shortcuts for files: Hold SHIFT and move a file with drag & drop to another folder in order to create a shortcut Bootstrap snippets. 1. js file. To select a specific file, you need to use the number assigned to it. See bundler defaults for the full list. Bower is optimized for the front-end. 15. React. This allows teams to set conventions that work best for them, and to adopt React in any way they would like to. It also supports uploading a file by dragging it from Windows Explorer to FileManager control. Cezerin is open-source ecommerce platform. Tailor your React grid component according to your needs. dll GitHub - networknt/react-file-manager: A react remote file manager with Light Framework as back end file system. Social Sites Integration: With one click, you can login to your site using Social Sites. Using the arrow keys, move over the desired file or folder and press Space on the keyboard. 9 all file uploads, including those initiated by the File Browser plugin, expect a JSON response (like this one ). In addition to building skills, this curriculum incorporates introspection, perspective shifting, and awareness building. These two parts are very decoupled and only communicate using postMessage. Plugin has two parts: Front-end & Back-end. Note that your changes would be temporary and will not persist between re-runs of your package manager. 
Other Feature Module in React template: Voice and Video Call File Manager Contacts and Email Departments and Designations Timesheet and Overtime Kanban Board Payroll, Payslip and Payrun Company Policies Performance, Goal Tracking, Training and Promotion Resignation and Termination Faq and Knowledgebase Profile Settings, Profile and Edit Profile 🎈 React Material-UI. August 08, 2018. Store the file in state, and only upload when a user clicks the upload button. Overview. This package help you to upload file and assests from react native project to your azure blob storage service. If you want to download large files/streaming you can use Android Download Manager. React components can be installed via yarn or npm: After install, import the minified CSS file in your app's entry file: File manager built with the help of Suite components: Layout, Grid, DataView, Toolbar, etc. js doesn’t have to be hard and with these few steps, you can do it yourself. js'; class App extends React. Use npm to install the Expo CLI command line utility from the Windows Command Prompt, PowerShell, Windows Terminal, or the integrated terminal in VS Code (View > Integrated Terminal). Clear the cache from admin panel. js , or Total. How you use packages is up to you. html file should reveal our code coverage in a human readable and hopefully revealing way. We will cd into react project react-file-upload – cd react-file-upload Now will install dependencies – npm install bootstrap npm install react-toastify npm install axios The bootstrap help to create ui based on bootstrap 4, react-toastify is use to display beautiful notification into react app and axios for HTTP client. Save time by quickly jumping to directories. CKEditor 4 can be easily integrated with an external file manager (file browser/uploader) thanks to the File Browser plugin which by default is included in the Standard and Full presets. A very smart filemanager to manage your files in the browser developed in AngularJS following Material Design styles by Jonas Sciangula Street. These files will always be rendered/loaded to the page when an instance of the module is on the page (Module instances are the individual rendered modules on the page). FileManager Conclusion To make a file downloadable from your website, start by creating a folder on your server for both your website's HTML page and the file you want to share. Select the file in the manager. It is often used for developing Web Applications or Mobile Apps. mp3, . Edit: But As a web app. 2. You can add a custom thumbnail and text description to every file or folder. That said there are a few common approaches popular in the ecosystem you may want to consider. changing a . Drag and Drop Support in React FileManager component 23 Feb 2021 / 1 minute to read The file manager allows files or folders to be moved from one folder to another by using the allowDragAndDrop property. Once you make the folder, you can find it by using your Control Panel's file manager or the file browser in your FTP program. In XCode, in the project navigator, select your project. files[0]holds the actual file and its details. react-file-manager. React Native ; Bootstrap file-manager examples. Download the App Center SDK for React Native frameworks provided as a zip file and unzip it. React, Node v12. A dual-pane file manager for Mac, Windows and Linux. Restore on demand Library Manager will restore client-side libraries whenever the libman. html, the rest of the website build in React shows nothing (white page). 
Set Api path for React. To connect the component with the file system items, assign the Remote File System Provider to the fileSystemProvider property. React File Manger Multi-column File Manager based on react-beautiful-dnd. React JS Developers one. js. Here are some of the best places to find up-to-date information on React and TypeScript: React TypeScript Cheatsheets React is a JavaScript library that aims to simplify development of visual interfaces. svg'; // replace it with your path // Specify your default image import defaultUser from '. json. File Manager. css'; import 'devextreme/dist/css/dx. js under dist/ directory. /. The FileBrowser provides the ability to browse directories and locate a file item. Chocolatey is software management automation for Windows that wraps installers, executables, zips, and scripts into compiled packages. React Chart. npm run build. files[0]holds the actual file and its details. This one is a little different. All of the files shared are under GPL License. Like a photo, pdf or any other file type. 3] - 2020-04-04 VueJS + Laravel, HTML + Laravel Updated. in/building-a-full-st Related Posts: How to download file from server using Angular; Prerequisites. Cronus File Manager Live Demo. Files files will be hosted on the server on a cloud service. js under dist/ directory. Allows creating a Progressive Web Apps built with React and Node. A flexible and beautiful Select Input control for ReactJS with multiselect, autocomplete and ajax support. You can add support for other types by adding an assetExts resolver option in your Metro Bower keeps track of these packages in a manifest file, bower. Please note that bluehost doesn’t upload folder and its content. Most common file types are supported including . Scheduler. Go to the My Media/My Documents/My Photos folder. If you don’t have that file already, you just create a blank file, and put that content into it. js is an open-source JavaScript library that is used for building user interfaces specifically for single-page applications. The KendoReact Upload helps users send files from their file systems to dedicated server handlers which are configured to receive them. Basic usage. Mobile applications definitely offer a greater value to businesses than their mobile website • Fixed Spell checker not working and missing Image Advanced Edit button in Node JS SDK • Fixed Unable to load any images or files Python Flask SDK • Fixed Upload Video not working in Rail SDk • Fixed On opening an uploaded file throws "HTTP status 404-Not Found" in Java SDK • Fixed Unable to upload images in Java SDK • Fixed On opening an uploaded file throws "Template is missing The require syntax described above can be used to statically include audio, video or document files in your project as well. If nothing happens, download GitHub Desktop and try again. xml :This file contain list . com and its affiliated web properties is provided "as is" without warranty of any kind. As of Kendo UI R1 2020 SP1 the kendo. Go to the My Media/My Documents/My Photos folder. Click on the Delete button. Material Dashboard React Nodejs . ) Simple Example. 2] - 2020-02-18 React Aaded. react-dom@^16. npm install -g expo-cli Use Expo to create a React Native app that runs on iOS, Android, and web. For the former, there is a library called react-dropzone that is built with React. File Operations. This happened right after updating the code when I tried to upload some . 
Pass the endpointUrl to the remote file system provider object to specify the Url at which the component can access the file system items. First we need to install the dependencies for React. ui. Note: In the documentation, the package babel-plugin-styled-components is specified, as well as a . This project provides a web file manager interface, allowing you to create your own backend connector following the connector API. 3 - Upgraded React version to 17. However, it is not designed to work with SSR. target. Mvc4. There’s nothing more to add, just check out our demo to get a clear idea of what you can do with it. It’s used for handling the view layer for web and mobile apps. jsx) by right clicking on container folder script => react folder select a file from new items dialog popup and click on Add button. /data. This is where Babel macros come in. 07 August 2019. This prod Nowadays, Node Package Manager (npm) is one of the most demanded gadgets in the web developer tool belt. You can open the Task Manager by 2 options. The value should be an async function that receives a webpack config and eventually returns a webpack config. js on cPanel. Like a photo, pdf or any other file type. Along the way, we will build a massive e-commerce application similar to Shopify using React, Redux, React Hooks, React Router, GraphQL, Context API, Firebase, Redux-Saga, Stripe + more. js on cPanel. bat ) as this generates a few files required by the build (such as SharedAssemblyVersionInfo. With this in place, feel free to open the solution file in Visual Studio or VS Code. Free Web File Thunar is developed by Benedikt Meurer, and was originally intended to replace XFFM, Xfce's previous file manager. React Scheduler Storybook is an open source tool for developing UI components in isolation for React, Vue, and Angular. Ignite UI for React also includes the most complete Microsoft Excel solution and 60+ chart types with interactive panning and zooming, touch support and much more. An online file manager which can be used on its own, or as a plugin for a rich-text editor such as CKeditor, TinyMCE or FCKeditor. png extension) Let’s take a quick look at how to manage those breaking user interactions: to the . Option 1: Type "task" in the search box beside the Start menu, and press Enter when you see the "Task Manager" app. /. export const fileItems = [{ 'name': 'Documents', 'isDirectory': true, 'category': 'Work', 'items': [{ 'name': 'Projects', 'isDirectory': true, 'category': 'Work The Custom File System Provider allows you to implement custom APIs to handle file operations (add, delete, rename, etc. Initially, the selectedFilestate is set to null Next time you’re looking for a file, it’s just a click away in the file manager. This will add the LibraryManager NuGet package (Microsoft. . The Edit screen with option to select one or more files is displayed. js) as shown below. Webix suggests a ready-made solution, which is JS File manager, that can be built into any web application. Fileside Modern, tiling file manager with unlimited panes. I have a demo on Laravel + React. React, Redux, Material UI, Nodejs, ExpressJs . We use Gatsby with TypeScript for this website, so that can also be a useful reference implementation. module folders. Work fast with our official CLI. To select a file or folder: 1. log(event. Store the file in state, and only upload when a user clicks the upload button. wav, . 
You can then use the Dropzone component to render the HTML5 Drag What we would like to see from a project manager is the following: - A candidate that can manage: 1 - Experience with React context api . 90/5. Declaration files. It come with unlimited customized email with your domain. Looking for the best react native app development companies? Here is the list of the top React native developers with reviews by ADA. To select a specific file, you need to use the number assigned to it. Angle - Responsive Bootstrap Admin Template. Use it as a child component of you application. As with any programming problem, there are many ways to achieve this outcome. Run npm install and npm start after that. For initialising file manager you have to install and run both of them from terminal with commands . 3. 5. react-files. /. A predictable state container for JavaScript apps. xcodeproj file. JSX Now, we need to create a first component to create a file (. A simple file manager built with react. Client implementation is an npm package which can be embed into your application. Grouping by features or routes One common way to structure projects is to locate CSS, JS, and tests together inside folders grouped by feature or route. yarn add react yarn add react-dom yarn add --dev parcel-bundler. 11, React 16/17. The File Manager is a graphical user interface component used to manage the file system. mp4, . Delete a file. Web. expo-file-system ( docs) expo-media-library ( docs) After you’ve done that we can proceed. Create shortcuts for files: Hold SHIFT and move a file with drag & drop to another folder in order to create a shortcut There are several possible ways of using Webix with React: using a Webix widget in a React app; creating a custom Webix+React component; using a Webix widget with Redux; How to Start. Option 1: Package Manager. This article explains a simple way to implement the approach to upload a single file with React. Software Package Manager. View demo Download Source. Say “MORE OPTIONS” 3. Free . You can assign custom color to every folder and tag, which makes the visual search an easy step. - Fixed minor bugs. net/` // Or you React File Manager Usage (iOS) First you need to install react-native-file-manager: npm install react-native-file-manager --save. " File Manager. Step 8: Configuring AVD Manager. On the backend, we are going to use Laravel’s Storage API to store images. Now, you can start adding Essential JS 2 File Manager component to the application. 3. All basic file handling mechanisms like upload, download, read, edit, delete, search, and sort can be performed to manage and organize the files and folder in a file system. ly/3d8cXTx To learn more about the react-native visit: The FileManager UI component can work with a file system located on the server. Angular React Vue jQuery PeaZip is a free archiver tool. config. com is looking for React JS Developers for our team in Delhi/NCR (India) Employment: Permanent Employment Place of Work: Delhi/NCR (India) CTC: Best in the industry Role. Complete file and folder manager: Create, rename, move and delete a folder. Reference React. 0 / scheduler@^0. Use the fileSystemProvider property to configure the component's file system provider. Video-React is a web video player built from the ground up for an HTML5 world using React library. Let’s begin with the Redux side of things: The Redux Code Unlike the other frameworks covered in this module, React does not enforce strict rules around code conventions or file organization. 
Free Frontend Preset For Nodejs . That's when we got the idea to create an orthodox WEB file manager, working on the server's site, which would be able to copy between different sources with server speed and would offer: file and directory search, a disk usage analyzer (an analogue of ncdu), simple file uploading and a lot of other great stuff. In traditional HTML sites, the file upload form forces a page refresh, which might be confusing to users. Chocolatey integrates w/SCCM, Puppet, Chef, etc. The React File Manager component allows for the easy uploading and downloading of files in a Sorting. KFM – Kae’s File Manager. Webix File Manager is a ready-made SPA. " For example, I prepare a page “Commercial files” where I will put a shortcode corresponding to the folder of files uploaded in File Manager or Google Drive. React was first created by Jordan Walke, a software engineer working for Facebook. Web The JavaScript Client Library for Azure Storage enables many web development scenarios using storage services like Blob, Table, Queue, and File, and is compatible with modern browsers. npm install npm run start Design. thumbnail support for image files; built-in media player; text editor; many other features. I want to do a very simple file explorer in react that look like the one of Files for google. Multi-Selection. js, and Mongo. React Shopping Cart. More Template Epic React - HR Management Admin Template is High Resolution: Yes, Compatible Browsers: Firefox, Safari, Opera, Chrome, Edge, Compatible With: ReactJS, Bootstrap 4. These first have been selected by most active users and ranking has been given based on the most popular votes. So follow the below setups:- 1) Install the @react-native-community/checkbox package like below in your application 2) Link the module in your application 3) Import Get code examples like "usenavigate react" instantly right from your google search results with the Grepper Chrome Extension. 6. Disclaimer: The information provided on DevExpress. 0/v14. css'; import FileManager from 'devextreme-react/file-manager'; import { fileItems } from '. Since CKEditor 4. In the process the compiler strips away all function and method bodies and preserves only the signatures of the types that are exported. You have to manually create sub-folder, then upload files into that folder. React Filemanager Hello ex angular-filemanager user, this is the new version in React. The application provides an unified, natively portable, cross-platform file manager and archive manager GUI for many Open Source technologies like 7-Zip, FreeArc, PAQ, UPX. But first, here are the benefits of hosting your React. The ASP. It is developed by laborasyon on ThemeForest. In this tutorial, we will upload an image from the react js component. 80/5. React is an open-source JavaScript library developed by Facebook used for creating web frontend and UI components. 14. File Manager and Core Data: Used to save photo, video, audio, and pdf data to the ios device url sessions: Used to communicated with the server to upload the data to the Utah State Geographical Cuba admin is super flexible, powerful, clean & modern responsive bootstrap 5 admin template with unlimited possibilities. Or if you have the optional Yarn package manager installed. React & JavaScript articles. The File Manager component supports multiple selections of files and folders in a file system. Storybook - GitHub Pages angular-filemanager. 
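For the "very simple file explorer" idea mentioned above, a small recursive component that renders folders and files from the same name/isDirectory/items shape shown earlier is enough as a starting point. This is purely illustrative and uses no file-manager library.

import React from 'react';

// Recursively render a folder/file tree from data shaped like the fileItems array above
function FileTree({ items }) {
  return (
    <ul>
      {items.map((item) => (
        <li key={item.name}>
          {item.isDirectory ? '[dir]' : '[file]'} {item.name}
          {item.isDirectory && item.items && <FileTree items={item.items} />}
        </li>
      ))}
    </ul>
  );
}

export default FileTree;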
It is worth noting the beautiful design, and a ready-made set of icons, which are included in the delivery. 10. Work fast with our official CLI. Beside Material-UI, we also integrated, with the same design style, over 80 React widgets and plugins. Click on the Next button you will see a System React Fixed. Developer Express Inc disclaims all warranties, either express or implied, including the warranties of merchantability and fitness for a particular purpose. NullPointerException; TypeError: string indices must be integers – Python; valueerror: setting an array element with a sequence – Python; TypeError: a bytes-like object is required, not ‘str’ – Python Drop files, select on filesystem, copy and paste files, or add files using the API. To start the app server it will display live changes (optional) 4. 그럼 스타뜨! 배경 DCE내에서 파일 업로드 및 관리를 할수 있는 GUI 화면이 필요했다. React Scheduler Disclaimer: The information provided on DevExpress. The issue with this is that, because we’re using create-react-app, we can’t configure a lot of things unless we eject. This package support multiple files selection, cloud storage integration. Creating a file upload component is a common task in web development. Developer Express Inc disclaims all warranties, either express or implied, including the warranties of merchantability and fitness for a particular purpose. At least two fields must be present in the definition file: name and version. 0] – 2020-11-28 HTML, HTML + Laravel ADDED All-new design based on Ul/UX principles New Bordered & Dark layout New eCommerce Dashboard Invoice Bamburgh React Admin Dashboard with Reactstrap PRO is built entirely on React and uses the popular starter kit Create React App from Facebook. Now my APP only show that green circle button located in index. e. Just be sure to follow the installation instructions for “bare” or plain react-native apps. json file. Build files will be created build. First, we install dependencies using npx then download the laravel project. 4. Download Epic React – HR Management Admin Template nulled from the below download links and if the item satisfy you then buy it from the developer puffintheme for commercial use. - Minor fixes of RTL SCSS. npm install --save react npm install --save react-dom npm install --save-dev parcel-bundler. js - TS docs; Gatsby - TS Docs; All of these are great starting points. Finally, what all this was leading up to, opening that index. I guess it's technically possible to write a file manager in Node, use React for the UI, and package it as a desktop app with Electron, but I would still not call that "React based" (and C. js in your Greg Fodor - Engineering Manager Mozilla The development team involved have been very impressed by the React Admin framework and it has been capable of handling the complex challenges we have had for it thusfar. It uses React framework and supports connectors to different file storages. Drag & Drop your files in folders: Drag & Drop and image to move it into a folder, where you can find other files. File Manager: Admin can import/export & upload new files. packages. Simple event handlers are also provided as props to the browser, which allow it to respond to actions on the files. Thus you will get an example of integration usage. onChangeHandler=event=>{ console. . It has a large UI collection. dll. onChangeHandler=event=>{ console. 
import React from 'react'; import ReactDOM from 'react-dom'; import { FileManager, FileNavigator } from '@opuscapita/react-filemanager'; import connectorNodeV1 from '@opuscapita/react-filemanager-connector-node-v1'; const apiOptions = { connectorNodeV1. json file is saved. API-first CMS. json file and choose “Enable Restore on Build”. FileManager also performs operations like creating a new folder, moving files, and searching. This control is part of the Telerik UI for ASP. Created from revision f160547f47 on 12/4/2020. All APIs that implement access to Azure Blob Storage on the client are stored in the azure-file-system. banzay/friends-app-redux Second take on friends app. 4 - Creating RESTful services with Package Manager stores application information in three files, located in /data/system. Is the Excel Viewer widget compatible with the Webix community (opensource) edition? PHP & MySQL Projects for $2 - $10. rtl8761a_mp_chip_bt40_fw_asic_rom_patch_8192eu_new. d. Select the file to upload from the file selector dialog box; Downloading a file. pdf. NET Core FileManager lets your users browse through directories and files, akin to file managers like Windows Explorer, and manage file storage within their web applications. spatie/laravel-medialibrary Released: August 2015 Installs: 178 000 Last update: May 2017 (1 day ago). Note the command dotnet new react; this is the template I’m using for this React project. Your domain will look in this ‘public_html’ folder for a top ‘index. You'll see a folder named AppCenterReactNativeShared which contains a single framework for the required React Native iOS bridge. Be it a web-based gaming experience where you store state information in the Table service, uploading photos to a Blob account from a Mobile app, or an entire CodeSandbox at its core consists of two parts: the editor and the preview. Predefined connectors are: Client React connector for Server Node API v1 Localization in React FileManager component The file manager can be localized to any culture by defining the texts and messages of the file manager in the corresponding culture. babelrc file present in the application root folder. jpeg extension) Uploading an image where the file extension has been intentionally changed and Cloudinary could process it, but the DOM could not render the file (eg. Web based File Manager Manage files online From within the free control panel, an easy to use File Manager helps you to upload files, download files or even edit HTML, PHP or other programming language files. 2. The content of package. Create a new file called manager. All basic file operations like creating a new folder, uploading and downloading of files in the file system, and deleting and renaming of existing files and folders are available in the file manager component. Used technologies. askwon/Filet-Manager Web-based file transfer client written in React, Redux, and Go; ayxos/react-cellar Typescript, MongoDb, Webpack, EC6, Typings, Redux Wine-Cellar; azu/read-all-later [Electron] Read All Later is a client for Pocket. - Added New Auth pages. To run the service, create an Amazon S3 account and a S3 bucket and then register your amazon S3 client account details like bucketName, awsAccessKeyId, awsSecretKeyId and awsRegion details in RegisterAmazonS3 method to perform the file operations. 9. 5. Angle is an admin template based on Bootstrap and multiple frameworks. Personalize your React grid with flexible API. 
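The @opuscapita/react-filemanager imports and the apiOptions object above are truncated; a reconstruction of the intended wiring might look roughly like the sketch below. The apiRoot URL is a placeholder for your own Server Node API instance, and the FileNavigator props (id, api, capabilities) are assumptions based on the connector object rather than API confirmed by this text.

import React from 'react';
import ReactDOM from 'react-dom';
import { FileManager, FileNavigator } from '@opuscapita/react-filemanager';
import connectorNodeV1 from '@opuscapita/react-filemanager-connector-node-v1';

// Spread the connector's default API options and point apiRoot at your own backend
const apiOptions = {
  ...connectorNodeV1.apiOptions,
  apiRoot: 'http://localhost:3020' // placeholder URL
};

ReactDOM.render(
  <div style={{ height: '480px' }}>
    <FileManager>
      <FileNavigator
        id="filemanager-1"
        api={connectorNodeV1.api}
        apiOptions={apiOptions}
        capabilities={connectorNodeV1.capabilities}
      />
    </FileManager>
  </div>,
  document.getElementById('root')
);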
Organizing your blog media files with the Real Media Library plugin is as easy as dragging and dropping them into folders. JavaScript File Manager or in other words File Explorer is a web widget, part of the UI framework for managing files. Developed at Facebook and released to the world in 2013, it drives some of the most widely used apps, powering Facebook and Instagram among countless other applications. light. . Here native file viewer means we are not going to view the file in our application instead we will pick the file from file picker and will pass the file URL to FileViewer component provided by react-native-file-viewer, this component will trigger the native iOS/Android file viewer to open the file. Say “MORE OPTIONS” 3. ). To delete one or more files, 1. To download a remote file’s content to a local file on the device, here’s the code: Hi Dev, In this blog, I will show you how to install file manager package in laravel application. Install Step 1 npm i react-native-azure-blob-storage-manager --save Step 2 Dependencies npm install --save react-native-background-upload iOS cd ios pod install Manual Installation Installation. Node. Use Git or checkout with SVN using the web URL. Page 16. Documentation. We are going to use react-dropzone to build an image uploader. Hello ex angular-filemanager user, this is the new version in React. react file manager +your&theme& +set&new feature&AGGREGATION +potential j and css ) ideal&HTML5 and Bootstrap&AGGREGATION +component&dashboard template&AGGREGATION +Redux&application&依赖 +It&beautiful design&依赖 +LOT&components and feature&AGGREGATION +you&live preview&依赖 +it&LOT&依赖 +it&components and feature&依赖 +filemanager namespace&FileManagerCommand class&依赖 +Module file&multi-pane module editor&依赖 +Module file&design manager&依赖 +include&App&依赖 +include&ej2-react-filemanager package&依赖 +include&FileManagerComponent&依赖 +1&4&依赖 +1&Bootstrap version&依赖 +1&Bootstrap version&依赖 +1&4&依赖 +download manager&HTTP connection&依赖 +I&shortcode&依赖 +Mobile application&greater value&依赖 +we&package&依赖 +we&package&依赖 +Mobile application&business&依赖 +their&website& +Mobile application&mobile website&依赖 +Storybook&setup& +your&self& +You&webpack setup&依赖 +your&opinions& +webpack setup&setup&GENERALIZATION +import&dx& +import React&React&GENERALIZATION +your&needs& +dependency&itself&依赖 +you&file upload component&依赖 +top&list&AGGREGATION +order&file&AGGREGATION +your&files& +You&file&依赖 +You&order&依赖 +react-overlay&them&依赖 +you&them&依赖 +aspect&target&AGGREGATION +net core suite&100 + fully-featured UI component&依赖 +net core suite&100 + fully-featured UI component&依赖 +net core suite&100 + fully-featured UI component&依赖 +You&file or folder&依赖 +left&file or folder&AGGREGATION +You&left&依赖 +You&symbol&依赖 +its&properties& +warranty&kind&AGGREGATION +heart&smartphone&AGGREGATION +file manager application&smartphone&依赖 +rest&page&AGGREGATION +" trap " focus&keyboard navigation cycle&依赖 +Modal&focus& +Overview&Kendo UI FileManager&AGGREGATION +I&react&依赖 +your&collaboration& +my&tasks& +your&JavaScript& +your&url& +current url&Drupal&AGGREGATION +Syncfusion&Studio& +our&model& +file&a dot )&依赖 +your&manager& +dot file&file&GENERALIZATION +file&file manager&依赖 +it&hidden file&依赖 +you&dot file&依赖 +autoFocus&modal&依赖 +autoFocus&vanilla Bootstrap&依赖 +sample&Amazon S3 file system provider&依赖 +React&reusable UI component&依赖 +React&us&依赖 +SVN&web URL&依赖 +you&look&依赖 +you&file input&依赖 +look&file input&AGGREGATION +you&form&依赖 +your&design& +Chocolatey&business&依赖 +Chocolatey&software 
deployment&依赖 +default image const [ defaultuserimage react filemanager&Specify&依赖 +svg '&file input const imageref = useref&依赖 +default image const [ defaultuserimage react filemanager&Specify&依赖 +default image const [ defaultuserimage react filemanager&default image const [ defaultuserimage react filemanager&依赖 +default image const [ defaultuserimage react filemanager&default image const [ defaultuserimage react filemanager&依赖 +svg '&file input const imageref = useref&依赖 +your&path& +FTP Access Upload file&easier and faster way to upload and download&依赖 +FTP Access Upload file&easier and faster way to upload and download&依赖 +default locale&file manager&AGGREGATION +our&manager& +It&kendo-react-upload package&依赖 +It&NPM&依赖 +It&standalone app&依赖 +Python dictionary&key&依赖 +Python dictionary&performance&依赖 +React ’s implementation&popular drag and drop library&AGGREGATION +react-dropzone&file uploading&实现 +react-dropzone&popular drag and drop library&实现 +our&opinion& +Webix library&opinion&依赖 +Webix library&best solution&依赖 +we&filemanager (&依赖 +we&File Server Node API component&依赖 +we&opuscapita )&依赖 +your&application& +You&span&依赖 +You&grid element&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&flat keyed list&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&flat keyed list&依赖 +folder base file browser&flat keyed list&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&flat keyed list&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&flat keyed list&依赖 +flat keyed list&object&AGGREGATION +folder base file browser&flat keyed list&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&flat keyed list&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&flat keyed list&依赖 +folder base file browser&folder base file browser&依赖 +folder base file browser&flat keyed list&依赖 +folder base file browser&flat keyed list&依赖 +folder base file browser&flat keyed list&依赖 +folder base file browser&flat keyed list&依赖 +client react connector&Google Drive API v2&依赖 +client react connector&Google Drive API v2&依赖 +scaffold&what&依赖 +You&project&依赖 +you&right&依赖 +ES7 syntax Another file format&MD file extension&依赖 +specify camera icon&button text import camera&依赖 +React Native This&lightweight and easy-to-use password manager Clonezilla&依赖 +exact mirror&React Native project&AGGREGATION +React Native This&React Native project&依赖 +operating system&file manager&依赖 +operating system&required file&依赖 +Create React App&React Application&依赖 +webpack configuration&you&依赖 +Storybook&configuration& +their&files& +customer&static file&依赖 +one&admin tool&AGGREGATION +customer&shared host&依赖 +our&customers& +null filebrowser dialog&FileBrowser object&依赖 +let ’s&Bootstrap and React&依赖 +My&folders& +My&Documents& +You&Media&依赖 +You&file&依赖 +your&React& +flag add&dependency&依赖 +one&two way&AGGREGATION +Run bower&&依赖 +Run bower&save&依赖 +flag add&dependency&依赖 +partition and disk&React debugging tool&依赖 +partition and disk&Chrome Developer Tools&依赖 +FileBrowser dialog&object previewer/property manager&依赖 +FileBrowser dialog&FileBrowser object&依赖 +them&individual contributor 
role&依赖 +them&manager role&依赖 +json&JSON&依赖 +It&collaboration&依赖 +It&my development task and time&依赖 +It&me&依赖 +template and NOT&final purchase file&AGGREGATION +part&template and NOT&AGGREGATION +They&template and NOT&依赖 +file&above import&依赖 +right click Libraries Add&Go&依赖 +project&Go& +your&project& +an html5 drag & drop file uploader&an html5 drag & drop file uploader&依赖 +I&somebody&依赖 +you&‘ public_html ’ folder&依赖 +you&domain&依赖 +you&file manager&依赖 +Your&first& +build script ( dev-build&build script ( dev-build&依赖 +Manager&icons& +) wrap long file name&Folder Tree&依赖 +) wrap long file name&icon&依赖 +) wrap long file name&) wrap long file name&依赖 +com&working&依赖 +com&you&依赖 +com&opportunity&依赖 +You&flexible ReactJS component&依赖 +create-react-app&files [ 0 ] ) }&依赖 +create-react-app&browser&依赖 +list&top react native developer&AGGREGATION +I&File Manager&依赖 +I&file&依赖 +popular open-source JavaScript library&you&AGGREGATION +project&build&依赖 +part&project&AGGREGATION +project&part&依赖 +project&restore&依赖 +json file&Build ”&依赖 +json file&“ Enable Restore&依赖 +json file&Build ”&依赖 +json file&“ Enable Restore&依赖 +lot&people&AGGREGATION +file&module-name&依赖 +upload react website&“&AGGREGATION +subdomain open file manager&new folder&依赖 +upload react website&” folder&依赖 +I&Kendo UI File Manager&依赖 +I&custom command&依赖 +Restore&client-side library&依赖 +Restore&libman&依赖 +Restore&libman&依赖 +Restore&client-side library&依赖 +user&own directory&依赖 +It&multiple user&依赖 +creation&multiple user&AGGREGATION +It&creation&依赖 +its&directory& +we&task manager application&依赖 +it&previewer&依赖 +DevExtreme JavaScript FileManager component&you&依赖 +DevExtreme JavaScript FileManager component&files and directory&依赖 +I&addition&依赖 +I&React JSX file&依赖 +corresponding online repository&package&AGGREGATION +time&top&依赖 +time&GCS cloud storage bucket&依赖 +top&GCS cloud storage bucket&AGGREGATION +time&Dropbox-like functionality&依赖 +time&Dropbox-like functionality&依赖 +time&GCS cloud storage bucket&依赖 +time&top&依赖 +you&standard file input HTML element&依赖 +Binding&UI element&AGGREGATION +React&modern web application&依赖 +one&best choice&AGGREGATION +React&best choice&依赖 +React&slim apus&依赖 +installation file&it&AGGREGATION +KendoReact Upload component&KendoReact library&依赖 +KendoReact library&React UI component&AGGREGATION +KendoReact Upload component&React UI component&依赖 +part&KendoReact library&AGGREGATION +your&analytics& +you&analytic&依赖 +React component&component&GENERALIZATION +new React component&JavaScript and TypeScript&依赖 +file caching system&two main part&依赖 +Our&system& +tailor fman&powerful plugin system&依赖 +tailor fman&powerful plugin system&依赖 +its&system& +tailor fman&powerful plugin system&依赖 +tailor fman&powerful plugin system&依赖 +tailor fman&powerful plugin system&依赖 +tailor fman&powerful plugin system&依赖 +tailor fman&powerful plugin system&依赖 +tailor fman&powerful plugin system&依赖 +React&single button&依赖 +app&interface& +React&few piece&依赖 +React&interface&依赖 +few piece&interface&AGGREGATION +requirement&File Upload Component&依赖 +requirement&React&依赖 +file manager&local computer&依赖 +npm&save&依赖 +React&web&依赖 +React&frontend library&依赖 +your&computer& +npm&react-file&依赖 +reacto&x86/x64 personal computer&依赖 +npm start&build&依赖 +npm start&a new&依赖 +system service&long-running download&依赖 +system service&handling&依赖 +system service&long-running download&依赖 +system service&handling&依赖 +handling&long-running download&AGGREGATION +RNFetchBlob&functionality& +method&you&依赖 +method&Windows Explorer&依赖 
+its&features& +list&key feature&AGGREGATION +md file store transaction&md file store transaction&依赖 +second be&Redux store&依赖 +set&actions and reducer&AGGREGATION +second be&Redux store&依赖 +second be&actions and reducer&依赖 +Webix widgets and application&Angular or React environment&依赖 +integrate tinymce editor&Laravel&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&Laravel&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&Laravel&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&Laravel&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +integrate tinymce editor&image upload jquery php july 27 , 2019 2,006 view&依赖 +image upload jquery php july 27 , 2019 2,006 view&Sortable&依赖 +integrate tinymce editor&Laravel&依赖 +integrate tinymce editor&Laravel&依赖 +we&animation&依赖 +we&File Manager App uus&依赖 +we&React Native&依赖 +You&high end protection&依赖 +It&file manager&依赖 +you&React&依赖 +you&reusable component&依赖 +table&file manager&依赖 +table&default texts and message&依赖 +table&en culture&依赖 +default texts and message&file manager&AGGREGATION +en culture&culture&GENERALIZATION +modern toolchain&React developer&AGGREGATION +this project base course&this project base course&依赖 +user&image and video and etc and Instagram&依赖 +user&example&依赖 +repo&page& +your&repo& +your&repository& +benefit&React&AGGREGATION +folder&creation&GENERALIZATION +full list&compatible frameworks and integration example&AGGREGATION +Files and folder&npm install&依赖 +part&default react-native init installation&AGGREGATION +other file & folder and some truncate )&default react-native init installation&依赖 +our&focus& +other file & folder and some truncate )&default react-native init installation&依赖 +you&other file&依赖 +Accessible&Keyboard&依赖 +Accessible&Keyboard&依赖 +own React&Browserify&依赖 +own React&process (&依赖 +npm&package manager&依赖 +npm&Node&依赖 +multiple package&package&依赖 +Bower&jQuery&依赖 +multiple package&example&依赖 +You&two option&依赖 +You&file uploader&依赖 +reacto&computer&依赖 +your&tools& +free , opensource reimplementation&windows relate&AGGREGATION +Paper Kit reacto&windows relate&实现 +Developer Express Inc&warranty&依赖 +warranty&merchantability and fitness&AGGREGATION +Developer Express Inc&warranty&依赖 +Removed complementary design file&[ 5&依赖 +tagspace feature basic file management operation&tagspace feature basic file management operation&依赖 +it&simple file manager&依赖 +dhtmlxGrid&rich API functionality&依赖 +unparalleled array&feature&AGGREGATION +Asus file manager&file manager&GENERALIZATION +part&dhtmlxSuite library&AGGREGATION +world&grid& +last post 2 hour&last post 2 hour&依赖 +net core&RSS 0&依赖 +last post 2 hour&last post 2 hour&依赖 +last post 2 hour&fiazahmed an electron base file manager&依赖 +net core&RSS 0&依赖 +net core&RSS 0&依赖 +last post 2 hour&fiazahmed an electron base file 
manager&依赖 +It&visual file manager&依赖 +it&function&依赖 +your&app& +set&function&AGGREGATION +it&set&依赖 +presence&drag & drop responsiveness&依赖 +presence&handler&AGGREGATION +newer version&program use&AGGREGATION +FileManager&file system provider&依赖 +you&opinion&依赖 +you&folder&依赖 +fastlane&folder&依赖 +you&file&依赖 +you&file&依赖 +Firebase issue&issue&GENERALIZATION +Firebase issue&[ 5&依赖 +Firebase issue&[ 5&依赖 +create-react-app utility&tool&依赖 +create-react-app utility&Babel and webpack&依赖 +create-react-app utility&Babel and webpack&依赖 +create-react-app utility&tool&依赖 +we&Client React component&依赖 +Client React component&React component&GENERALIZATION +our&library& +our&detail& +nyc_output and coverage folder&instrumentation detail&依赖 +React admin dashboard&frontend framework&依赖 +React admin dashboard&material-uus&依赖 +term&frontend framework&AGGREGATION +React admin dashboard&term&依赖 +It&accessing&依赖 +It&user&依赖 +It&common file operation&依赖 +sample&Android 4 ICS emulator image&依赖 +smartphone&file manager&依赖 +smartphone&view&依赖 +npm package&call package&依赖 +npm package&file&依赖 +you&number&依赖 +see bundler default&full list&依赖 +see bundler default&full list&依赖 +Bower&front-end&依赖 +It&file&依赖 +Cezerin&open-source ecommerce platform&依赖 +your&component& +you&Social Sites&依赖 +your&site& +9 all file upload&a json response (&依赖 +9 all file upload&a json response (&依赖 +curriculum&introspection&依赖 +curriculum&building skill&依赖 +curriculum&addition&依赖 +Plugin&Front-end & Back-end&依赖 +Plugin&two part&依赖 +re-run&package manager&AGGREGATION +your&changes& +user&upload button&依赖 +this package help&file and assest&依赖 +this package help&you&依赖 +your&service& +you&large files/streaming&依赖 +you&Android Download Manager&依赖 +help&Suite component&AGGREGATION +React component&yarn or npm&依赖 +you&yourself&依赖 +you&few step&依赖 +app&file& +class App&App&GENERALIZATION +class App&React&依赖 +use npm&Expo CLI command line utility&依赖 +you&package&依赖 +html file&code coverage&依赖 +html file&code coverage&依赖 +our&coverage& +bootstrap npm&axio&依赖 +react-toastify&beautiful notification&依赖 +We&react project react-file-upload – cd react-file-upload&依赖 +bootstrap npm&npm&GENERALIZATION +bootstrap help&uus&依赖 +CKEditor 4&external file manager and ( file browser/uploader&依赖 +CKEditor 4&( file browser/uploader&依赖 +smart filemanager&Material Design style&依赖 +smart filemanager&Jonas Sciangula Street&依赖 +smart filemanager&Material Design style&依赖 +smart filemanager&Material Design style&依赖 +smart filemanager&angularj&依赖 +smart filemanager&Jonas Sciangula Street&依赖 +smart filemanager&angularj&依赖 +smart filemanager&Jonas Sciangula Street&依赖 +smart filemanager&angularj&依赖 +page ( module instance&page )&依赖 +file&page )&依赖 +file&page&依赖 +file&individual rendered module&依赖 +instance&module&AGGREGATION +your&server& +website&page& +your&website& +You&custom thumbnail and text description&依赖 +You&file or folder&依赖 +Drag and Drop Support&files or folder&依赖 +React FileManager component 23 Feb 2021 / 1 minute&file manager&依赖 +your&program& +you&it&依赖 +your&Panel& +you&folder&依赖 +Panel&manager& +file&actual file&依赖 +its&details& +file&actual file&依赖 +rest¬hing ( white page )&依赖 +rest&website&AGGREGATION +React File Manger Multi-column File Manager&react-beautiful-dnd&依赖 +js developers one&js developers one&依赖 +React and TypeScript&development&实现 +development&visual interface&AGGREGATION +React and TypeScript&visual interface&实现 +/ /&path / / Specify&依赖 +/ /&it&依赖 +/ /&/&GENERALIZATION +Specify&defaultUser& +your&defaultUser& +your&Specify& +software 
management automation&installer&依赖 +Chocolatey&Windows&依赖 +Files file&cloud service&依赖 +Files file&server&依赖 +a flexible and beautiful select input control&reactj&依赖 +a flexible and beautiful select input control&reactj&依赖 +a flexible and beautiful select input control&reactj&依赖 +a flexible and beautiful select input control&multiselect , autocomplete and ajax support&依赖 +a flexible and beautiful select input control&multiselect , autocomplete and ajax support&依赖 +a flexible and beautiful select input control&multiselect , autocomplete and ajax support&依赖 +a flexible and beautiful select input control&reactj&依赖 +a flexible and beautiful select input control&multiselect , autocomplete and ajax support&依赖 +a flexible and beautiful select input control&multiselect , autocomplete and ajax support&依赖 +a flexible and beautiful select input control&multiselect , autocomplete and ajax support&依赖 +a flexible and beautiful select input control&multiselect , autocomplete and ajax support&依赖 +a flexible and beautiful select input control&multiselect , autocomplete and ajax support&依赖 +assetExts resolver option&package&AGGREGATION +Metro Bower keeps track&package&AGGREGATION +You&other type&依赖 +your&track& +You&support&依赖 +its&content& +please note&folder&依赖 +you&file&依赖 +you&blank file&依赖 +their&systems& +KendoReact Upload&file system&依赖 +KendoReact Upload&file&依赖 +Template&require syntax&依赖 +uploaded file&Java SDK • fix&依赖 +their&checker& +uploaded file&http status 404-not found "&依赖 +uploaded file&upload image&依赖 +uploaded file&java sdk • fixed unable&依赖 +kendo ui r1&kendo&依赖 +kendo ui r1&kendo ui r1&依赖 +npm&g expo-cli Use Expo&依赖 +npm&React Native app&依赖 +component&Url&依赖 +component&file system item&依赖 +we&dependency&依赖 +package babel-plugin-styled-component&documentation&依赖 +your&connector& +you&it&依赖 +you&what&依赖 +our&demo& +you&it&依赖 +you&what&依赖 +container folder script = > react folder&file&依赖 +container folder script = > react folder&file&依赖 +container folder script = > react folder&new item dialog popup&依赖 +container folder script = > react folder&new item dialog popup&依赖 +container folder script = > react folder&new item dialog popup&依赖 +container folder script = > react folder&new item dialog popup&依赖 +container folder script = > react folder&file&依赖 +container folder script = > react folder&file&依赖 +node package manager ( npm )&web developer tool belt&依赖 +one&most demanded gadget&AGGREGATION +node package manager ( npm )&most demanded gadget&依赖 +You&Task Manager&依赖 +You&2 option&依赖 +async function&webpack config&依赖 +we&way&依赖 +we&massive e-commerce application&依赖 +Free Web File Thunar&Benedikt Meurer&依赖 +Xfce&manager& +Ignite UI&most complete Microsoft Excel solution&依赖 +Ignite UI&most complete Microsoft Excel solution&依赖 +its&own& +png extension ) Let&quick look&依赖 +you&" Task Manager " app&依赖 +Work '&export const fileitem = [ { ' name '&依赖 +Custom File System Provider&custom api&实现 +Custom File System Provider&you&实现 +export const fileitem = [ { ' name '&export const fileitem = [ { ' name '&依赖 +Work '&[ { ' name '&依赖 +{&name& +it&click&依赖 +it&file manager&依赖 +Webix&ready-made solution&依赖 +I&Laravel + React&依赖 +I&demo&依赖 +We&TypeScript&依赖 +We&website&依赖 +We&Gatsby&依赖 +our&CLI& +we&what&依赖 +You&Dropzone component&依赖 +Dropzone component&component&GENERALIZATION +your&domain& +child component&you application&AGGREGATION +many way&programming problem&依赖 +many way&programming problem&依赖 +we&file (&依赖 +we&first component&依赖 +npm package&package&GENERALIZATION +One common way&CSS&依赖 
+yarn&react-dom yarn&依赖 +React&component&GENERALIZATION +article&simple way&依赖 +article&approach&实现 +You&custom color&依赖 +You&folder and tag&依赖 +net /&/&依赖 +we&Laravel ’s Storage API&依赖 +you&essential j 2 file manager component&依赖 +you&application&依赖 +all basic file handling mechanism&all basic file handling mechanism&依赖 +all basic file handling mechanism&all basic file handling mechanism&依赖 +all basic file handling mechanism&upload , download , read , edit , delete , search&依赖 +all basic file handling mechanism&upload , download , read , edit , delete , search&依赖 +FileManager UI component&file system&依赖 +com&team&依赖 +com&delhi/ncr ( india ) employment&依赖 +com&delhi/ncr ( india ) ctc&依赖 +our&team& +com&React JS Developers&依赖 +Permanent Employment Place&Work&AGGREGATION +component&provider& +React&code convention&依赖 +React&strict rule&依赖 +Redux side&thing&AGGREGATION +site&different source&依赖 +we&orthodox WEB file manager&依赖 +analogue&ncdu )&AGGREGATION +we&idea&依赖 +we&working&依赖 +server&site& +lot&other great stuff&AGGREGATION +file upload form&traditional HTML site&依赖 +file upload form&page refresh&依赖 +Chocolatey&w/SCCM&依赖 +React File Manager component&easy uploading and downloading&依赖 +easy uploading and downloading&file&AGGREGATION +React File Manager component&file&依赖 +React File Manager component&File Manager component&GENERALIZATION +React File Manager component&file in ###&依赖 +I&a page “ commercial file&依赖 +I&file&依赖 +I&example&依赖 +I&shortcode correspond&依赖 +I&shortcode correspond&依赖 +folder&file&AGGREGATION +React&software engineer&依赖 +React&Jordan Walke&依赖 +React&working&依赖 +npm&npm run start Design&依赖 +I&react&实现 +I&simple file explorer&依赖 +one&file&AGGREGATION +your&results& +compiler strip&process&依赖 +signature&type&AGGREGATION +compiler strip&function and method body&依赖 +compiler strip&process&依赖 +compiler strip&function and method body&依赖 +You&sub-folder&依赖 +React Filemanager Hello ex angular-filemanager user and&React&依赖 +It&laborasyon&依赖 +It&ThemeForest&依赖 +we&image&依赖 +we&tutorial&依赖 +we&react js component&依赖 +you&optional Yarn package manager&依赖 +File Manager component&files and folder&依赖 +File Manager component&multiple selection&依赖 +File Manager component&files and folder&依赖 +multiple selection&files and folder&AGGREGATION +File Manager component&multiple selection&依赖 +a ready-made set&icon&AGGREGATION +It&beautiful design&依赖 +we&80 react widget and plugin&依赖 +string index&sequence – Python&依赖 +string index&array element&依赖 +it&live change&依赖 +issue&thing&依赖 +issue&lot&依赖 +issue&thing&依赖 +issue&lot&依赖 +lot&thing&AGGREGATION +this package support multiple file selection&cloud storage integration&依赖 +this package support multiple file selection&this package support multiple file selection&依赖 +this package support multiple file selection&this package support multiple file selection&依赖 +this package support multiple file selection&cloud storage integration&依赖 +least two field&name and version&依赖 +least two field&definition file&依赖 +definition file&file&GENERALIZATION +new bordered & dark layout new ecommerce dashboard invoice bamburgh react admin dashboard&React&依赖 +0 ] – 2020-11-28 HTML , HTML + Laravel ADDED All-new design&new bordered & dark layout new ecommerce dashboard invoice bamburgh react admin dashboard&依赖 +0 ] – 2020-11-28 HTML , HTML + Laravel ADDED All-new design&Reactstrap PRO&依赖 +APP&green circle button&依赖 +my&APP& +we&dependency&依赖 +we&npx&依赖 +Download Epic React – HR Management Admin Template&commercial use&依赖 +item&you&依赖 +Download Epic React – HR 
Management Admin Template&download link&依赖 +Download Epic React – HR Management Admin Template&developer puffintheme&依赖 +Minor fix&RTL SCSS&AGGREGATION +npm&react-dom npm install&依赖 +your&Fodor& +It&React framework&依赖 +File Manager&upload&依赖 +File Manager&new file&依赖 +browser&it&依赖 +example&integration usage&AGGREGATION +you&example&依赖 +you&integration usage&依赖 +It&large UI collection&依赖 +import React&import { filemanager , filenavigator }&依赖 +import React&react-filemanager&依赖 +/&react-filemanager&GENERALIZATION +FileManager&operation&依赖 +control&Telerik UI&依赖 +control&ASP&依赖 +part&Telerik UI&AGGREGATION +api&azure-file-system&依赖 +api&client&实现 +api&access&实现 +api&Azure Blob Storage&实现 +banzay/friends-app-redux second take&friends app&依赖 +Excel Viewer widget&) edition webix community ( opensource&依赖 +NET Core FileManager&user&依赖 +your&users& +their&applications& +domain&‘ public_html ’ folder&依赖 +Your&domain& +domain&top ‘ index&依赖 +folder&required React Native iOS bridge&依赖 +You&folder&依赖 +folder&single framework&依赖 +you&Table service&依赖 +you&state information&依赖 +web-based gaming experience&two part&依赖 +its&core& +file manager&culture&依赖 +Predefined connector&Server Node API v1 Localization&依赖 +Predefined connector&React FileManager component&依赖 +texts and message&file manager&AGGREGATION +babelrc file&present&依赖 +Cloudinary&it&依赖 +DOM&file ( eg&依赖 +File Manager&you&依赖 +content&package&AGGREGATION +basic file operation&file manager component&依赖 +basic file operation&file manager component&依赖 +all later&Pocket&依赖 +your&client& +your&grid& +part&UI framework&AGGREGATION +component&native iOS/Android file viewer&依赖 +our&application& +we&file&依赖 +we&file picker&依赖 +I&laravel application&依赖 +file&content& +I&file manager package&依赖 +I&you&依赖 +Install Step 1 npm i react-native-azure-blob-storage-manager&Manual Installation Installation&依赖 +We&react-dropzone&依赖 diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager-simEnts.txt b/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager-simEnts.txt deleted file mode 100644 index 600b501136a12f398d7b5d1d04cd146a9e6d4d6e..0000000000000000000000000000000000000000 --- a/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager-simEnts.txt +++ /dev/null @@ -1 +0,0 @@ -react file manager All components included in this dashboard template has been developed to bring all the potential of HTML5 and Bootstrap plus a set of new features (JS and CSS) ideal for your next dashboard admin theme or admin web application project. 확장성을 보유할것 외부 프로젝트에서도 Description. Redux helps you write applications that behave consistently, run in different environments (client, server, and native), and are easy to test. It has a beautiful design, as you can see from the live previews and it contains a LOT of components and features. filemanager namespace exposes the FileManagerCommand class that could be extended to implement a custom File Manager command. Just display a list within 2 predifined tabs (folders). Create React App - TS docs; Next. The Edit screen with option to select one or more files is displayed. Add Start script to package. 9,676 4. Inbuilt Search textbox in FileManager: See Also. Module files are represented in the design manager in a multi-pane module editor. To include the File Manager component in application import the FileManagerComponent from ej2-react-filemanager package in App. 1 - 28 November 2020 ----- - Upgraded Bootstrap version to 4. Filemanager with React & Nodejs . prod. 
The download manager handles HTTP connections, monitors connectivity changes, reboots, and ensures each download completes successfully. I would like this shortcode to be dynamic, i. Grab the demo from Github if you haven't done this yet. Mobile applications definitely offer a greater value to businesses than their mobile website In this tutorials we will use a package named @react-native-community/checkbox to add checkboxes in react native. Express your opinions freely and help others including your future self You can customize Storybook's webpack setup by providing a webpackFinal field in . 4. 3. import React from 'react'; import 'devextreme/dist/css/dx. js, Express and TypeScript. Install the React components and choose a theme that suits your needs. target. 2 - Ability to translate Wireframes and PSD Designs into functional web apps using HTML5, React , Node. In this tutorial you will learn how to create a working file upload component with react from scratch using no dependencies other than react itself. 2/6. /. You can rearrange the order of your files by dragging them around to move the important files to the top of the list for faster access. Run the Drupal Page having React Nested modals aren’t supported, but if you really need them the underlying react-overlays can support them if you're willing. Download the corresponding App Center SDK for iOS frameworks provided as a zip file and unzip it. NET Core suite along with 100+ fully-featured UI components designed to speed up delivery & improve every aspect of target. You’ll see a plus symbol to the left of the file or folder. com and its affiliated web properties is provided "as is" without warranty of any kind. An electron based file manager. The file manager application is like the heart of a smartphone. Modal's "trap" focus in them, ensuring the keyboard navigation cycles through the modal, and not the rest of the page. Async uploading with AJAX, or encode files as base64 data and send along form post. Overview of Kendo UI FileManager; Sort in Kendo UI FileManager; Toolbar Commands in Kendo UI FileManager Express your opinions freely and help others including your future self I am a beginner in react. LibraryManager. Developed with the latest jQuery plugins. html and . Complete file and folder manager: Create, rename, move and delete a folder. It's very important for me your collaboration on my development tasks and time. Test your JavaScript, CSS, HTML or CoffeeScript online with JSFiddle code editor. Go to react_code\src and change the apiUrl inside config. js as per your current url of Drupal. rtl8761a_mp_chip_bt40_fw_asic_rom_patch_8192ee_new. Say “EDIT MODE”. 1. To enable profiling in production mode, modify Webpack configuration file (config/webpack. To download and start utilizing Syncfusion's Essential Studio for React components, see our pricing model. A simple file manager built with react. azurewebsites. 0. Vue. Updated laravel 7 to all full version and starter-kit; React Fixed. mp4 videos to the server. Learn to build modern web applications using Angular, React & Vue! File Upload Component with Vue. A file input (dropzone) management component for React. TIP: If you have never seen a dot file (a file starting with a dot) it might be odd at first because that file might not appear in your file manager, as it’s a hidden file. Unlike vanilla Bootstrap, autoFocus works in Modals because React handles the implementation Free download Filedash – File Manager Dashboard Nulled. 
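The webpackFinal field mentioned above lives in .storybook/main.js and, as described earlier, must be an async function that receives a webpack config and returns a webpack config. A minimal sketch follows; the stories glob and the resolve tweak are only examples, not part of the original setup.

// .storybook/main.js
module.exports = {
  stories: ['../src/**/*.stories.js'], // example glob, adjust to the project
  // webpackFinal receives the generated webpack config and must return it (async)
  webpackFinal: async (config) => {
    // example tweak: also resolve imports from the src directory
    config.resolve.modules = [...(config.resolve.modules || []), 'src'];
    return config;
  },
};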
Maybe later i can have a button to have the grid view File-Manager 개발정의서 들어가며 본 문서는 인수인계 목적이 아닌 개발이 완료된 제품에 대한 이해를 돕기위해 제작된 개발 정의서입니다. Say for instance that you want to open the file select dialogue for a user to select an file to upload. This sample demonstrates how to utilize the Amazon S3 file system provider to manage the files in File Manager component. Deploy Trillo File Manager from GCP Marketplace In this tutorials we will use a package named @react-native-community/checkbox to add checkboxes in react native. Choose a device definition, Nexus 5X is suggestable. Create a new project with React Native. React also allows us to create reusable UI components. Use Git or checkout with SVN using the web URL. bs4 File Manager. v6. js . Source code: https://bit. Learn more . log(event. Also, you might want to customize the look of the file input in the form to make it resonate with your overall app design. Chocolatey is trusted by businesses to manage software deployments. Conclusion Let’s work that out. Scheduler. svg'; // replace it with your path // Profile upload helper const HandleImageUpload = => { // we are referencing the file input const imageRef = useRef(); // Specify the default image const [defaultUserImage React Filemanager. Download Nulled Filedash – File Manager Dashboard. 1 - Added new Scrollable layout. x, Columns: 4+. FTP Access Upload files via FTP Need easier and faster way to upload and download. The default locale of the file manager is en (English). In our editor / file manager we should see a . It is distributed through NPM under the kendo-react-upload package. It was initially called Filer but was changed to Thunar due to a name clash. It can be used as a standalone app or as a middleware. I will try to make it clean and retro-compatible with the previous bridges/connectors. When it comes to both of these issues, React can help you provide a better user experience. Themes and Skinning JavaScript - jQuery, Angular, React, Vue React Data Grid. Build files will be created build. Select, Copy, Paste, and Delete. Python dictionary add, delete, update, exists keys with performance; java. react-dropzone is a React’s implementation of popular drag and drop library for file uploading. And in our opinion, the Webix library offers the best solution available on the market. Free bootstrap snippets, examples and resources built with html, css and js. Adding React File Manager for PDF Library In the previous section, we added the File Server Node API component from filemanager (by -OpusCapita) . So follow the below setups:- 1) Install the @react-native-community/checkbox package like below in your application 2) Link the module in your application 3) Import Free React Design System For Bootstrap 4 (reactstrap) 9,824 4. You can add spans to any grid element, fine-tune the table sizes, specify the columns’ auto width, and freeze one or more columns. Folder based file browser given a flat keyed list of objects, powered by React. 10. Client React connector for Google Drive API v2; Detailed documentation for each package is coming soon. You can fire up the project with dotnet run to see what the scaffold does for you. /assets/images/defaultUser. It is fully responsive, built with Bootstrap 4 Framework, HTML5, CSS3 and SCSS. The editor is the whole CodeSandbox application (file manager, code editor, dependency settings) and the preview is the result you see on the right. 
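Since react-dropzone is described above as React's implementation of a popular drag-and-drop library for file uploading, a minimal hook-based sketch of it could look like this; the onDrop handler here just logs the accepted files, and the messages are placeholders.

import React, { useCallback } from 'react';
import { useDropzone } from 'react-dropzone';

function MyDropzone() {
  // onDrop receives the files dropped onto the zone or picked via the file dialog
  const onDrop = useCallback((acceptedFiles) => {
    console.log(acceptedFiles);
  }, []);

  const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop });

  return (
    <div {...getRootProps()}>
      <input {...getInputProps()} />
      {isDragActive
        ? <p>Drop the files here...</p>
        : <p>Drag and drop files here, or click to select</p>}
    </div>
  );
}

export default MyDropzone;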
Extension for Visual Studio Code - Simple extensions for React, Redux and Graphql in JS/TS with ES7 syntax Another file format that uses the MD file extension is Moneydance Financial Data. import React, { useEffect, useRef, useState } from 'react'; // Specify camera icon to replace button text import camera from '. - Added Blog List, Blog Grid, Blog Details pages. Site Navigation and Layout. React Native This is an exact mirror of the React Native project, A lightweight and easy-to-use password manager Clonezilla. All the operating systems got a file manager to filter the required files. Create React App – How to Create and Deploy a React Application to Production. and we can drill down into various modules. 5. 0 To do so, right-click the libman. By default, Storybook's webpack configuration will allow you to: Import Images and other static files Semantic UI React provides React components while Semantic UI provides themes as CSS stylesheets. mov, . (Ex – Facebook, Twitter and Google. Then add the File Manager component as shown in below code example. Web. View . This is one of the admin tools that our customers manage their static files on shared host. Initially, the selectedFilestate is set to null The FileBrowser dialogs consist of a FileBrowser object, an object previewer/property manager and a file uploader tab. Let’s install Bootstrap and React. ” - [source] You can delete the files from My Media, My Documents or My Photos folders. Managing your React. Site Navigation and Layout. This can be done in one of two ways: Run bower install --save for each package (the --save flag adds the dependencies (name and version) to the bower. fThe file is called : "FirstReactApp. bs4 File Manager. A partition and disk Adds React debugging tools to the Chrome Developer Tools. This is a Sample React Plugin for Apache OODT File Manager. The FileBrowser dialogs consist of a FileBrowser object, an object previewer/property manager and a file uploader tab. Thunar is designed to start up faster and be more responsive than some other Linux file managers, such as Nautilus and Konqueror. rtl8761a_mp_chip_bt40_fw_asic_rom_patch_8812ae_new. This is an example file with default selections. new file manager windows 10 Executive Summary These course materials were originally designed for Google managers to help them transition from an individual contributor role to a manager role. jsx". dll and React. filebrowser provides a file managing interface within a specified directory and it can be used to upload, delete, preview, rename and edit your files. Files. json must be written in JSON. I will try to make it clean and retro-compatible with the previous bridges/connectors It's very important for me your collaboration on my development tasks and time. They are not part of the template and NOT included in the final purchase files. So in the above imports, the files would be CartTotal. In XCode, in the project navigator, right click Libraries Add Files to [your project's name] Go to node_modules react-native-file-manager and add the . LibraryManager. You can go for either an HTML5 drag & drop file uploader or use the traditional way. json file is saved. I want somebody who can redo what has been done and finish it. On the file manager for a domain you have a ‘public_html’ folder. dll (if using MVC 4) in your Web Application project Your first build always needs to be done using the build script ( dev-build. The FileBrowser provides the ability to browse directories and locate a file item. Downloading the file. cs ). 
) Wrap long file names in the File Manager’s detail view Customize icons in the Folder Tree. com, lands you with the opportunity of working with a leading technology organization. v 2. You Install react-file-reader (A flexible ReactJS component for handling styled HTML file inputs. files[0]) } On saving, create-react-app will instantly refresh the browser. babelrc configuration file. Looking for the best react native app development companies? Here is the list of the top React native developers with reviews by ADA. Unfortunately it can be quite intimidating. if I add/remove files in File Manager, it will react dynamically on the front-side (so I don’t need to modify the shortcode or put a React is a popular open-source JavaScript library – many of you asked for an easier integration between Power BI and React web applications. Free bootstrap snippets, examples and resources tagged with file-manager, html, css and js. Themes and Skinning JavaScript - jQuery, Angular, React, Vue React Data Grid. Please help me to move forward with a donation by paypal :) The file manager component is used to browse, manage, and organize the files and folders in a file system through a web application. js. js file to a . Build) to the project, which will trigger a restore as part of project build. json file and choose “Enable Restore on Build”. Any FOSS lover is warmly welcomed A lot of people name React components with a capital letter in the file, to distinguish them from regular JavaScript files. To delete one or more files, 1. When viewing a module locally, the files are contained within module-name. Input A file input management component for React. WP Media Folder v5. Upload React website to subdomain Open File Manager Create new folder inside “public_html” Upload whole content of “build” folder into this new created folder. The “React JS Developer” role at one. How can I create a custom command for the Kendo UI File Manager? Creating a Custom Command. Restore on demand Library Manager will restore client-side libraries whenever the libman. It allows the creation of multiple users and each user can have its own directory. In this tutorial we are going to create a task manager application from scratch with react. Once a file item is selected, it (or its properties) is loaded in the previewer. The DevExtreme JavaScript FileManager component allows you to display and manage files and directories for different file systems. To make the functions work as expected, I transpile these into CommonJS format in addition to transpiling React JSX files. 부디 도움이 되길 바랄 뿐입니다. It's a command-line utility connected with the corresponding online repository of packages and is capable of package installation, version management, and dependency management. Step 9: Configuring AVD Manager. This time with Trillo File Manager is an application for Dropbox-like functionality on the top of the GCS cloud storage bucket. However, you don’t want to use the standard file input HTML element, instead use a styled link or button to show the file window. 3 - Binding of UI elements to JavaScript object models. 2. React is one of the best choices for building modern web applications. js file. Default configuration. This is a Sample React Plugin for Apache OODT File Manager. React has a slim API, a robust and evolving ecosystem and a great community. If nothing happens, download Xcode and try again. com and its affiliated web properties is provided "as is" without warranty of any kind. 
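For the point above about not exposing the standard file input and using a styled link or button to show the file window instead, a common sketch is to hide the native input and forward the button click to it via a ref. The names and the className are illustrative only.

import React, { useRef } from 'react';

function StyledFilePicker() {
  const fileInputRef = useRef(null); // reference to the hidden native input

  // Clicking the styled button opens the file select dialogue for the user
  const openFileDialog = () => fileInputRef.current.click();

  return (
    <div>
      <input
        type="file"
        ref={fileInputRef}
        style={{ display: 'none' }}
        onChange={(e) => console.log(e.target.files[0])}
      />
      <button className="btn" onClick={openFileDialog}>Choose a file</button>
    </div>
  );
}

export default StyledFilePicker;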
After downloading the installation file of it, double click on it and proceed with the installation. The KendoReact Upload component is part of the KendoReact library of React UI components. common. Say “EDIT MODE”. json React Component by Creating. You can delete the files from My Media, My Documents or My Photos folders. The new React component supports both JavaScript and TypeScript and will help you embed your analytics in a React web application. Our file caching system will have two main parts. Tailor fman to your needs with its powerful plugin system. React can handle a single button, a few pieces of an interface, or an app's entire user interface. js. Requirements Creating a File Upload Component with React. react-native-azure-blob-storage-manager. npm install react-files --save Usage Basic I don't think there is one, but it's such a strange question, React is used on the web as a frontend library, while a file manager runs on your local computer. Then use the Axios library to send the file request to the Laravel server and saves the image in the server. ReactOS is a free and open-source operating system for x86/x64 personal computers intended to be binary-compatible with computer programs and device drivers made for Windows Server 2003. Component { render() { return ( ); } } export default App; File Manager can be initialized using the tag. npm start To create a new build inside dist directory. Disclaimer: The information provided on DevExpress. Download Manager is a system service which optimizes the handling of long-running downloads in the background. The first is a React component, which will wrap around RNFetchBlob’s functionality and respond to changes in the Redux store. This method of deleting corrupted files requires you to close "Windows Explorer" through "Task Manager". json. A list of its key features is given below. The MD file stores transactions, budgets, stock information, bank accounts, and other related data for the Moneydance finance software. The second is a set of actions and reducers on the Redux store which deal specifically with file caching. apiOptions, apiRoot: `http://opuscapita-filemanager-demo-master. Integrate TinyMCE editor in Laravel with a File Manager / Image Upload Jquery PHP July 27, 2019 2,006 views Create Sortable, drag and drop multi-level list with jquery like wordpress menu page All Webix widgets and applications function well in the Angular or React environment. Today we will create File Manager App UI with animation using React Native. Sweet Alert in dark layout; Design Files Removed. Installation. Free bootstrap snippets, examples and resources tagged with file-manager, html, css and js. . dll. When a TypeScript script gets compiled there is an option to generate a declaration file (with the extension . You have high end protection; It also has a file manager that is easy to access. With React, you can create reusable components that are independent of each other. Use the Download button in the toolbar. The following table represents the default texts and messages of the file manager in en culture. This project based course will introduce you to all of the modern toolchain of a React developer in 2020. For example, users can upload images, videos, etc on Facebook, Instagram. Source + Demo. Data List; React DataTable Component Vue based front-end for File Manager Aug 01, 2018 1 min read. To associate your repository with the react-electron topic, visit your repo's landing page and select "manage topics. . 6. Benefits of Hosting React. 
This will add the LibraryManager NuGet package (Microsoft. Add events to precisely control file/folder operations (folder creation, file uploading, moving, deleting, etc. 6. Go through the following steps for creating React project to download file from server using React. A full list of the compatible frameworks and integration examples you can find on this page . Files and folders in the file system can be sorted in either ascending or descending order simply by npm install --save @opuscapita/react-filemanager @opuscapita/react-filemanager-connector-node-v1. If nothing happens, download GitHub Desktop and try again. import ReactFileReader from 'react-file-reader'; class Because the other files & folders above (some truncated) are usually part of a default react-native init installation, our focus would be on the src folder:. tsx. 5, npm 6. Drag & Drop your files in folders: Drag & Drop and image to move it into a folder, where you can find other files. To configure the AVD Manager click on the respective icon in the menu bar. These first have been selected by most active users and ranking has been given based on the most popular votes. Accessible , tested with AT software like VoiceOver and JAWS, navigable by Keyboard . changing a . target. Install from NPM and include it in your own React build process (using Browserify, Webpack, etc). The project is about uploading a users products/services. The name npm (Node Package Manager) stems from when npm first was created as a package manager for Node. If multiple packages depend on a package - jQuery for example - Bower will download jQuery just once. React. View . Communicating react with asp. holyidiot updated Vuexy - Vuejs, React, HTML & Laravel Admin Dashboard Template with a new update entry: Update [6. 0] – 2020-11-28 Latest Update [6. ts) that functions as an interface to the components in the compiled JavaScript. Added 2021-01-09 file-manager,file-browser spofly Desktop app to find lyrics of currently playing song on spotify. You have two options for creating a file uploader. ReactOS will only be compatible with computers that are compatible with Windows 2003 or XP. ej2-react-filemanager. Basic usage. Any FOSS lover is warmly welcomed React Native ; Bootstrap file-manager examples. Bower provides hooks to facilitate using packages in your tools and workflows. jsx" Select Web => JSX File, and enter file name "FirstReactApp. The FileManager provides an inbuilt Search functionality, allowing you to find the specific file in the currently selected folder. React File Manager: A Lightweight & Customizable Component File upload and download. File Browser Front-end. Paper Kit ReactOS is a free, opensource reimplementation of windows Related: How to Copy and Paste Text, Files and Folders in Linux Terminal. Developer Express Inc disclaims all warranties, either express or implied, including the warranties of merchantability and fitness for a particular purpose. Removed complementary design files from the package [5. Uploading Files using HTML5 Uploader. TagSpaces features basic file management operations, so it can be used as simple file manager. To associate your repository with the react-electron topic, visit your repo's landing page and select "manage topics. dhtmlxGrid contains rich API functionality. This is an unparalleled array of features, design elements and reusable components Introduction to Asus file manager. To do so, right-click the libman. /assets/images/camera. 
Try All UI Components for Free All UI components for React are part of the dhtmlxSuite library. Changing the Webpack config. Add multiple URL to pocket at a time. Create better React apps faster and add data visualizations with the world's fastest, virtualized, real-time React data grid and streaming financial and business charts. File uploading means a user from a client machine wants to upload files to the server. net core on remote server (httpdocs folder) does not work RSS 0 replies Last post 2 hours, 59 minutes ago by fiazahmed An electron based file manager. 10 – WordPress File Manager Using the default WordPress media manager also means that the plugin will be very compatible with all the other plugins you use. It’s not a visual file manager, but it gives a set of functions to easily handle media/files in your Laravel app. html’ file to run on start up. Once a file item is selected, it (or its properties) is loaded in the previewer. Read the full article at: http://bgwebagency. The presence of these handlers enables the buttons and/or the drag & drop responsiveness. (eg. However, newer versions of the program use . The FileManager uses file system providers to access file systems. fastlane/ This folder, as you might React doesn’t have opinions on how you put files into folders. files[0]) } On saving, create-react-app will instantly refresh the browser. No action needed. pdf file to use a . Firebase issue during npm install [5. The standard tool for this task is Babel. The create-react-app utility configures tools such as Babel and webpack for the client-side React application. Build) to the project, which will trigger a restore as part of project build. In this section, we are going to add the Client React component from OpusCapita for navigating the folders and listing the files in our PDF library. nyc_output and coverage folder containing our instrumentation detail. . In terms of frontend frameworks, this React admin dashboard is powered by Material-UI, which is the most popular material-based UI components framework available today. - Added File Manager Page. MONEYDANCE files instead. Blog Post. Page 16. lang. A file input (dropzone) management component for React. Videos: Manage member level settings & Videos created by Users. storybook/main. ) Channels: Manage member level settings & Channel created by Users. It enables the user to perform common file operations such as accessing, editing, uploading, downloading, and sorting files and folders. To upload a file with React and Laravel, create a React file component and backend in Laravel. 2. React Chart. The following sample is extracted from Android 4 ICS emulator image. Similarly, every smartphone has a file manager to view, edit, and create any text files, delete, sort, or rename, copy, and cut whenever required. All npm packages are defined in files called package. ). The official front-end framework for building experiences that fit seamlessly into Microsoft 365. Create shortcuts for files: Hold SHIFT and move a file with drag & drop to another folder in order to create a shortcut Bootstrap snippets. 1. js file. To select a specific file, you need to use the number assigned to it. See bundler defaults for the full list. Bower is optimized for the front-end. 15. React. This allows teams to set conventions that work best for them, and to adopt React in any way they would like to. It also supports uploading a file by dragging it from Windows Explorer to FileManager control. Cezerin is open-source ecommerce platform. 
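To make the "upload a file with React and Laravel" flow above concrete, here is a minimal sketch of the React side: the file taken from event.target.files[0] is kept in component state and only sent when the user clicks Upload. The /api/upload endpoint and the form field name are placeholders, not a fixed Laravel route.

import React, { useState } from 'react';
import axios from 'axios';

function FileUploader() {
  const [selectedFile, setSelectedFile] = useState(null);

  // event.target.files[0] holds the actual file and its details
  const onChangeHandler = (event) => setSelectedFile(event.target.files[0]);

  const onUploadHandler = async () => {
    if (!selectedFile) return;
    const data = new FormData();
    data.append('file', selectedFile); // the field name must match what the Laravel controller expects
    await axios.post('/api/upload', data); // hypothetical endpoint that saves the image on the server
  };

  return (
    <div>
      <input type="file" onChange={onChangeHandler} />
      <button onClick={onUploadHandler} disabled={!selectedFile}>Upload</button>
    </div>
  );
}

export default FileUploader;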
Tailor your React grid component according to your needs. dll GitHub - networknt/react-file-manager: A react remote file manager with Light Framework as back end file system. Social Sites Integration: With one click, you can login to your site using Social Sites. Using the arrow keys, move over the desired file or folder and press Space on the keyboard. 9 all file uploads, including those initiated by the File Browser plugin, expect a JSON response (like this one ). In addition to building skills, this curriculum incorporates introspection, perspective shifting, and awareness building. These two parts are very decoupled and only communicate using postMessage. Plugin has two parts: Front-end & Back-end. Note that your changes would be temporary and will not persist between re-runs of your package manager. Other Feature Module in React template: Voice and Video Call File Manager Contacts and Email Departments and Designations Timesheet and Overtime Kanban Board Payroll, Payslip and Payrun Company Policies Performance, Goal Tracking, Training and Promotion Resignation and Termination Faq and Knowledgebase Profile Settings, Profile and Edit Profile 🎈 React Material-UI. August 08, 2018. Store the file in state, and only upload when a user clicks the upload button. Overview. This package help you to upload file and assests from react native project to your azure blob storage service. If you want to download large files/streaming you can use Android Download Manager. React components can be installed via yarn or npm: After install, import the minified CSS file in your app's entry file: File manager built with the help of Suite components: Layout, Grid, DataView, Toolbar, etc. js doesn’t have to be hard and with these few steps, you can do it yourself. js'; class App extends React. Use npm to install the Expo CLI command line utility from the Windows Command Prompt, PowerShell, Windows Terminal, or the integrated terminal in VS Code (View > Integrated Terminal). Clear the cache from admin panel. js , or Total. How you use packages is up to you. html file should reveal our code coverage in a human readable and hopefully revealing way. We will cd into react project react-file-upload – cd react-file-upload Now will install dependencies – npm install bootstrap npm install react-toastify npm install axios The bootstrap help to create ui based on bootstrap 4, react-toastify is use to display beautiful notification into react app and axios for HTTP client. Save time by quickly jumping to directories. CKEditor 4 can be easily integrated with an external file manager (file browser/uploader) thanks to the File Browser plugin which by default is included in the Standard and Full presets. A very smart filemanager to manage your files in the browser developed in AngularJS following Material Design styles by Jonas Sciangula Street. These files will always be rendered/loaded to the page when an instance of the module is on the page (Module instances are the individual rendered modules on the page). FileManager Conclusion To make a file downloadable from your website, start by creating a folder on your server for both your website's HTML page and the file you want to share. Select the file in the manager. It is often used for developing Web Applications or Mobile Apps. mp3, . Edit: But As a web app. 2. You can add a custom thumbnail and text description to every file or folder. That said there are a few common approaches popular in the ecosystem you may want to consider. changing a . 
Drag and Drop Support in React FileManager component 23 Feb 2021 / 1 minute to read The file manager allows files or folders to be moved from one folder to another by using the allowDragAndDrop property. Once you make the folder, you can find it by using your Control Panel's file manager or the file browser in your FTP program. In XCode, in the project navigator, select your project. files[0]holds the actual file and its details. react-file-manager. React Native ; Bootstrap file-manager examples. Download the App Center SDK for React Native frameworks provided as a zip file and unzip it. React, Node v12. A dual-pane file manager for Mac, Windows and Linux. Restore on demand Library Manager will restore client-side libraries whenever the libman. html, the rest of the website build in React shows nothing (white page). Set Api path for React. To connect the component with the file system items, assign the Remote File System Provider to the fileSystemProvider property. React File Manger Multi-column File Manager based on react-beautiful-dnd. React JS Developers one. js. Here are some of the best places to find up-to-date information on React and TypeScript: React TypeScript Cheatsheets React is a JavaScript library that aims to simplify development of visual interfaces. svg'; // replace it with your path // Specify your default image import defaultUser from '. json. File Manager. css'; import 'devextreme/dist/css/dx. js under dist/ directory. /. The FileBrowser provides the ability to browse directories and locate a file item. Chocolatey is software management automation for Windows that wraps installers, executables, zips, and scripts into compiled packages. React Chart. npm run build. files[0]holds the actual file and its details. This one is a little different. All of the files shared are under GPL License. Like a photo, pdf or any other file type. 3] - 2020-04-04 VueJS + Laravel, HTML + Laravel Updated. in/building-a-full-st Related Posts: How to download file from server using Angular; Prerequisites. Cronus File Manager Live Demo. Files files will be hosted on the server on a cloud service. js under dist/ directory. Allows creating a Progressive Web Apps built with React and Node. A flexible and beautiful Select Input control for ReactJS with multiselect, autocomplete and ajax support. You can add support for other types by adding an assetExts resolver option in your Metro Bower keeps track of these packages in a manifest file, bower. Please note that bluehost doesn’t upload folder and its content. Most common file types are supported including . Scheduler. Go to the My Media/My Documents/My Photos folder. If you don’t have that file already, you just create a blank file, and put that content into it. js is an open-source JavaScript library that is used for building user interfaces specifically for single-page applications. The KendoReact Upload helps users send files from their file systems to dedicated server handlers which are configured to receive them. Basic usage. 
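Putting the fileSystemProvider and endpointUrl remarks above together, a minimal sketch of wiring the DevExtreme React FileManager to a remote backend could look like the following. The endpoint URL is a placeholder, and the module and CSS paths follow the import fragments quoted in this text.

import React from 'react';
import 'devextreme/dist/css/dx.common.css';
import 'devextreme/dist/css/dx.light.css';
import FileManager from 'devextreme-react/file-manager';
import RemoteFileSystemProvider from 'devextreme/file_management/remote_provider';

// The remote provider forwards load/upload/delete requests to the server-side file system endpoint.
const remoteProvider = new RemoteFileSystemProvider({
  endpointUrl: 'https://example.com/api/file-manager' // placeholder URL
});

export default function App() {
  // Assigning the provider to fileSystemProvider connects the component with the file system items.
  return <FileManager fileSystemProvider={remoteProvider} />;
}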
Mobile applications definitely offer a greater value to businesses than their mobile website • Fixed Spell checker not working and missing Image Advanced Edit button in Node JS SDK • Fixed Unable to load any images or files Python Flask SDK • Fixed Upload Video not working in Rail SDk • Fixed On opening an uploaded file throws "HTTP status 404-Not Found" in Java SDK • Fixed Unable to upload images in Java SDK • Fixed On opening an uploaded file throws "Template is missing The require syntax described above can be used to statically include audio, video or document files in your project as well. If nothing happens, download GitHub Desktop and try again. xml :This file contain list . com and its affiliated web properties is provided "as is" without warranty of any kind. As of Kendo UI R1 2020 SP1 the kendo. Go to the My Media/My Documents/My Photos folder. Click on the Delete button. Material Dashboard React Nodejs . ) Simple Example. 2] - 2020-02-18 React Aaded. react-dom@^16. npm install -g expo-cli Use Expo to create a React Native app that runs on iOS, Android, and web. For the former, there is a library called react-dropzone that is built with React. File Operations. This happened right after updating the code when I tried to upload some . Pass the endpointUrl to the remote file system provider object to specify the Url at which the component can access the file system items. First we need to install the dependencies for React. ui. Note: In the documentation, the package babel-plugin-styled-components is specified, as well as a . This project provides a web file manager interface, allowing you to create your own backend connector following the connector API. 3 - Upgraded React version to 17. However, it is not designed to work with SSR. target. Mvc4. There’s nothing more to add, just check out our demo to get a clear idea of what you can do with it. It’s used for handling the view layer for web and mobile apps. jsx) by right clicking on container folder script => react folder select a file from new items dialog popup and click on Add button. /data. This is where Babel macros come in. 07 August 2019. This prod Nowadays, Node Package Manager (npm) is one of the most demanded gadgets in the web developer tool belt. You can open the Task Manager by 2 options. The value should be an async function that receives a webpack config and eventually returns a webpack config. js on cPanel. Like a photo, pdf or any other file type. Along the way, we will build a massive e-commerce application similar to Shopify using React, Redux, React Hooks, React Router, GraphQL, Context API, Firebase, Redux-Saga, Stripe + more. js on cPanel. bat ) as this generates a few files required by the build (such as SharedAssemblyVersionInfo. With this in place, feel free to open the solution file in Visual Studio or VS Code. Free Web File Thunar is developed by Benedikt Meurer, and was originally intended to replace XFFM, Xfce's previous file manager. React Scheduler Storybook is an open source tool for developing UI components in isolation for React, Vue, and Angular. Ignite UI for React also includes the most complete Microsoft Excel solution and 60+ chart types with interactive panning and zooming, touch support and much more. An online file manager which can be used on its own, or as a plugin for a rich-text editor such as CKeditor, TinyMCE or FCKeditor. png extension) Let’s take a quick look at how to manage those breaking user interactions: to the . 
Option 1: Type "task" in the search box beside the Start menu, and press Enter when you see the "Task Manager" app. /. export const fileItems = [{ 'name': 'Documents', 'isDirectory': true, 'category': 'Work', 'items': [{ 'name': 'Projects', 'isDirectory': true, 'category': 'Work The Custom File System Provider allows you to implement custom APIs to handle file operations (add, delete, rename, etc. Initially, the selectedFilestate is set to null Next time you’re looking for a file, it’s just a click away in the file manager. This will add the LibraryManager NuGet package (Microsoft. . The Edit screen with option to select one or more files is displayed. js) as shown below. Webix suggests a ready-made solution, which is JS File manager, that can be built into any web application. Fileside Modern, tiling file manager with unlimited panes. I have a demo on Laravel + React. React, Redux, Material UI, Nodejs, ExpressJs . We use Gatsby with TypeScript for this website, so that can also be a useful reference implementation. module folders. Work fast with our official CLI. To select a file or folder: 1. log(event. Store the file in state, and only upload when a user clicks the upload button. wav, . You can then use the Dropzone component to render the HTML5 Drag What we would like to see from a project manager is the following: - A candidate that can manage: 1 - Experience with React context api . 90/5. Declaration files. It come with unlimited customized email with your domain. Looking for the best react native app development companies? Here is the list of the top React native developers with reviews by ADA. To select a specific file, you need to use the number assigned to it. Angle - Responsive Bootstrap Admin Template. Use it as a child component of you application. As with any programming problem, there are many ways to achieve this outcome. Run npm install and npm start after that. For initialising file manager you have to install and run both of them from terminal with commands . 3. 5. react-files. /. A predictable state container for JavaScript apps. xcodeproj file. JSX Now, we need to create a first component to create a file (. A simple file manager built with react. Client implementation is an npm package which can be embed into your application. Grouping by features or routes One common way to structure projects is to locate CSS, JS, and tests together inside folders grouped by feature or route. yarn add react yarn add react-dom yarn add --dev parcel-bundler. 11, React 16/17. The File Manager is a graphical user interface component used to manage the file system. mp4, . Delete a file. Web. expo-file-system ( docs) expo-media-library ( docs) After you’ve done that we can proceed. Create shortcuts for files: Hold SHIFT and move a file with drag & drop to another folder in order to create a shortcut There are several possible ways of using Webix with React: using a Webix widget in a React app; creating a custom Webix+React component; using a Webix widget with Redux; How to Start. Option 1: Package Manager. This article explains a simple way to implement the approach to upload a single file with React. Software Package Manager. View demo Download Source. Say “MORE OPTIONS” 3. Free . You can assign custom color to every folder and tag, which makes the visual search an easy step. - Fixed minor bugs. net/` // Or you React File Manager Usage (iOS) First you need to install react-native-file-manager: npm install react-native-file-manager --save. " File Manager. Step 8: Configuring AVD Manager. 
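The "Custom File System Provider" idea above boils down to mapping the standard file operations (list items, create a folder, rename, delete) onto your own backend API. A library-agnostic sketch follows; every endpoint path and handler name here is hypothetical and would be swapped for whatever connector your file manager component expects.

// Each handler calls a made-up REST endpoint; replace these with your real backend connector.
const api = 'https://example.com/api/files'; // placeholder base URL

const customFileProvider = {
  // list the items of a directory
  getItems: (parentPath) =>
    fetch(`${api}?path=${encodeURIComponent(parentPath)}`).then((res) => res.json()),

  // create a sub-folder
  createDirectory: (parentPath, name) =>
    fetch(`${api}/directory`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ parentPath, name })
    }),

  // rename a file or folder
  renameItem: (path, newName) =>
    fetch(`${api}/rename`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ path, newName })
    }),

  // delete a file or folder
  deleteItem: (path) =>
    fetch(`${api}?path=${encodeURIComponent(path)}`, { method: 'DELETE' })
};

export default customFileProvider;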
On the backend, we are going to use Laravel’s Storage API to store images. Now, you can start adding Essential JS 2 File Manager component to the application. 3. All basic file handling mechanisms like upload, download, read, edit, delete, search, and sort can be performed to manage and organize the files and folder in a file system. ly/3d8cXTx To learn more about the react-native visit: The FileManager UI component can work with a file system located on the server. Angular React Vue jQuery PeaZip is a free archiver tool. config. com is looking for React JS Developers for our team in Delhi/NCR (India) Employment: Permanent Employment Place of Work: Delhi/NCR (India) CTC: Best in the industry Role. Complete file and folder manager: Create, rename, move and delete a folder. Reference React. 0 / scheduler@^0. Use the fileSystemProvider property to configure the component's file system provider. Video-React is a web video player built from the ground up for an HTML5 world using React library. Let’s begin with the Redux side of things: The Redux Code Unlike the other frameworks covered in this module, React does not enforce strict rules around code conventions or file organization. Free Frontend Preset For Nodejs . That's when we got the idea to create an orthodox WEB file manager, working on the server's site, which would be able to copy between different sources with server speed and would offer: file and directory search, a disk usage analyzer (an analogue of ncdu), simple file uploading and a lot of other great stuff. In traditional HTML sites, the file upload form forces a page refresh, which might be confusing to users. Chocolatey integrates w/SCCM, Puppet, Chef, etc. The React File Manager component allows for the easy uploading and downloading of files in a Sorting. KFM – Kae’s File Manager. Webix File Manager is a ready-made SPA. " For example, I prepare a page “Commercial files” where I will put a shortcode corresponding to the folder of files uploaded in File Manager or Google Drive. React was first created by Jordan Walke, a software engineer working for Facebook. Web The JavaScript Client Library for Azure Storage enables many web development scenarios using storage services like Blob, Table, Queue, and File, and is compatible with modern browsers. npm install npm run start Design. thumbnail support for image files; built-in media player; text editor; many other features. I want to do a very simple file explorer in react that look like the one of Files for google. Multi-Selection. js, and Mongo. React Shopping Cart. More Template Epic React - HR Management Admin Template is High Resolution: Yes, Compatible Browsers: Firefox, Safari, Opera, Chrome, Edge, Compatible With: ReactJS, Bootstrap 4. These first have been selected by most active users and ranking has been given based on the most popular votes. So follow the below setups:- 1) Install the @react-native-community/checkbox package like below in your application 2) Link the module in your application 3) Import Get code examples like "usenavigate react" instantly right from your google search results with the Grepper Chrome Extension. 6. Disclaimer: The information provided on DevExpress. 0/v14. css'; import FileManager from 'devextreme-react/file-manager'; import { fileItems } from '. Since CKEditor 4. In the process the compiler strips away all function and method bodies and preserves only the signatures of the types that are exported. You have to manually create sub-folder, then upload files into that folder. 
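Where the text contrasts traditional HTML upload forms (which force a page refresh) with React components, a drag-and-drop uploader built on react-dropzone is one common approach. A minimal sketch, with the upload endpoint left as a placeholder:

import React, { useCallback } from 'react';
import { useDropzone } from 'react-dropzone';
import axios from 'axios';

export default function ImageDropzone() {
  // Called with the dropped or selected files; uploads each one without reloading the page.
  const onDrop = useCallback(async (acceptedFiles) => {
    for (const file of acceptedFiles) {
      const data = new FormData();
      data.append('image', file);
      await axios.post('/api/images', data); // hypothetical endpoint (e.g. backed by Laravel's Storage API)
    }
  }, []);

  const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop });

  return (
    <div {...getRootProps()}>
      <input {...getInputProps()} />
      {isDragActive ? <p>Drop the images here…</p> : <p>Drag and drop images here, or click to select</p>}
    </div>
  );
}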
React Filemanager Hello ex angular-filemanager user, this is the new version in React. The application provides an unified, natively portable, cross-platform file manager and archive manager GUI for many Open Source technologies like 7-Zip, FreeArc, PAQ, UPX. But first, here are the benefits of hosting your React. The ASP. It is developed by laborasyon on ThemeForest. In this tutorial, we will upload an image from the react js component. 80/5. React is an open-source JavaScript library developed by Facebook used for creating web frontend and UI components. 14. File Manager and Core Data: Used to save photo, video, audio, and pdf data to the ios device url sessions: Used to communicated with the server to upload the data to the Utah State Geographical Cuba admin is super flexible, powerful, clean & modern responsive bootstrap 5 admin template with unlimited possibilities. Or if you have the optional Yarn package manager installed. React & JavaScript articles. The File Manager component supports multiple selections of files and folders in a file system. Storybook - GitHub Pages angular-filemanager. It is worth noting the beautiful design, and a ready-made set of icons, which are included in the delivery. 10. Work fast with our official CLI. Beside Material-UI, we also integrated, with the same design style, over 80 React widgets and plugins. Click on the Next button you will see a System React Fixed. Developer Express Inc disclaims all warranties, either express or implied, including the warranties of merchantability and fitness for a particular purpose. NullPointerException; TypeError: string indices must be integers – Python; valueerror: setting an array element with a sequence – Python; TypeError: a bytes-like object is required, not ‘str’ – Python Drop files, select on filesystem, copy and paste files, or add files using the API. To start the app server it will display live changes (optional) 4. 그럼 스타뜨! 배경 DCE내에서 파일 업로드 및 관리를 할수 있는 GUI 화면이 필요했다. React Scheduler Disclaimer: The information provided on DevExpress. The issue with this is that, because we’re using create-react-app, we can’t configure a lot of things unless we eject. This package support multiple files selection, cloud storage integration. Creating a file upload component is a common task in web development. Developer Express Inc disclaims all warranties, either express or implied, including the warranties of merchantability and fitness for a particular purpose. At least two fields must be present in the definition file: name and version. 0] – 2020-11-28 HTML, HTML + Laravel ADDED All-new design based on Ul/UX principles New Bordered & Dark layout New eCommerce Dashboard Invoice Bamburgh React Admin Dashboard with Reactstrap PRO is built entirely on React and uses the popular starter kit Create React App from Facebook. Now my APP only show that green circle button located in index. e. Just be sure to follow the installation instructions for “bare” or plain react-native apps. json file. Build files will be created build. First, we install dependencies using npx then download the laravel project. 4. Download Epic React – HR Management Admin Template nulled from the below download links and if the item satisfy you then buy it from the developer puffintheme for commercial use. - Minor fixes of RTL SCSS. npm install --save react npm install --save react-dom npm install --save-dev parcel-bundler. js - TS docs; Gatsby - TS Docs; All of these are great starting points. Finally, what all this was leading up to, opening that index. 
I guess it's technically possible to write a file manager in Node, use React for the UI, and package it as a desktop app with Electron, but I would still not call that "React based" (and C. js in your Greg Fodor - Engineering Manager Mozilla The development team involved have been very impressed by the React Admin framework and it has been capable of handling the complex challenges we have had for it thusfar. It uses React framework and supports connectors to different file storages. Drag & Drop your files in folders: Drag & Drop and image to move it into a folder, where you can find other files. File Manager: Admin can import/export & upload new files. packages. Simple event handlers are also provided as props to the browser, which allow it to respond to actions on the files. Thus you will get an example of integration usage. onChangeHandler=event=>{ console. . It has a large UI collection. dll. onChangeHandler=event=>{ console. import React from 'react'; import ReactDOM from 'react-dom'; import { FileManager, FileNavigator } from '@opuscapita/react-filemanager'; import connectorNodeV1 from '@opuscapita/react-filemanager-connector-node-v1'; const apiOptions = { connectorNodeV1. json file is saved. API-first CMS. json file and choose “Enable Restore on Build”. FileManager also performs operations like creating a new folder, moving files, and searching. This control is part of the Telerik UI for ASP. Created from revision f160547f47 on 12/4/2020. All APIs that implement access to Azure Blob Storage on the client are stored in the azure-file-system. banzay/friends-app-redux Second take on friends app. 4 - Creating RESTful services with Package Manager stores application information in three files, located in /data/system. Is the Excel Viewer widget compatible with the Webix community (opensource) edition? PHP & MySQL Projects for $2 - $10. rtl8761a_mp_chip_bt40_fw_asic_rom_patch_8192eu_new. d. Select the file to upload from the file selector dialog box; Downloading a file. pdf. NET Core FileManager lets your users browse through directories and files, akin to file managers like Windows Explorer, and manage file storage within their web applications. spatie/laravel-medialibrary Released: August 2015 Installs: 178 000 Last update: May 2017 (1 day ago). Note the command dotnet new react; this is the template I’m using for this React project. Your domain will look in this ‘public_html’ folder for a top ‘index. You'll see a folder named AppCenterReactNativeShared which contains a single framework for the required React Native iOS bridge. Be it a web-based gaming experience where you store state information in the Table service, uploading photos to a Blob account from a Mobile app, or an entire CodeSandbox at its core consists of two parts: the editor and the preview. Predefined connectors are: Client React connector for Server Node API v1 Localization in React FileManager component The file manager can be localized to any culture by defining the texts and messages of the file manager in the corresponding culture. babelrc file present in the application root folder. jpeg extension) Uploading an image where the file extension has been intentionally changed and Cloudinary could process it, but the DOM could not render the file (eg. Web based File Manager Manage files online From within the free control panel, an easy to use File Manager helps you to upload files, download files or even edit HTML, PHP or other programming language files. 2. The content of package. Create a new file called manager. 
All basic file operations like creating a new folder, uploading and downloading of files in the file system, and deleting and renaming of existing files and folders are available in the file manager component. Used technologies. askwon/Filet-Manager Web-based file transfer client written in React, Redux, and Go; ayxos/react-cellar Typescript, MongoDb, Webpack, EC6, Typings, Redux Wine-Cellar; azu/read-all-later [Electron] Read All Later is a client for Pocket. - Added New Auth pages. To run the service, create an Amazon S3 account and a S3 bucket and then register your amazon S3 client account details like bucketName, awsAccessKeyId, awsSecretKeyId and awsRegion details in RegisterAmazonS3 method to perform the file operations. 9. 5. Angle is an admin template based on Bootstrap and multiple frameworks. Personalize your React grid with flexible API. Organizing your blog media files with the Real Media Library plugin is as easy as dragging and dropping them into folders. JavaScript File Manager or in other words File Explorer is a web widget, part of the UI framework for managing files. Developed at Facebook and released to the world in 2013, it drives some of the most widely used apps, powering Facebook and Instagram among countless other applications. light. . Here native file viewer means we are not going to view the file in our application instead we will pick the file from file picker and will pass the file URL to FileViewer component provided by react-native-file-viewer, this component will trigger the native iOS/Android file viewer to open the file. Say “MORE OPTIONS” 3. ). To delete one or more files, 1. To download a remote file’s content to a local file on the device, here’s the code: Hi Dev, In this blog, I will show you how to install file manager package in laravel application. Install Step 1 npm i react-native-azure-blob-storage-manager --save Step 2 Dependencies npm install --save react-native-background-upload iOS cd ios pod install Manual Installation Installation. Node. Use Git or checkout with SVN using the web URL. Page 16. Documentation. We are going to use react-dropzone to build an image uploader. Hello ex angular-filemanager user, this is the new version in React. react file manager diff --git a/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager.txt.xml.xls b/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..b162ca54b5175f545e6d964c9ad0ef56ec5c5dc9 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Apache OODT File Manager/React file manager.txt.xml.xls differ diff --git "a/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide-relation.txt" "b/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide-relation.txt" index fbb01d68e919c4da23d6cedfdae91f79d032c64b..adfa85b220fdb253bc01ab03e47341934f482e06 100644 --- "a/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide-relation.txt" +++ "b/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide-relation.txt" @@ -1,100 +1,221 @@ -Introduction -This is the developer guide for the Apache OODT Catalog and Archive Service (CAS) File Manager component, or File Manager for short. 
Primarily, this guide will explain the File Manager architecture and interfaces, including its tailorable extension points. For information on installation, configuration, and examples, please see our User Guides. - -The remainder of this guide is separated into the following sections: - -Project Description -Architecture -Extension Points -Current Extension Point Implementations -Project Description -The File Manager component is responsible for tracking, ingesting and moving file data and metadata between a client system and a server system. The File Manager is an extensible software component that provides an XML-RPC external interface, and a fully tailorable Java-based API for file management. - -Architecture -In this section, we will describe the architecture of the File Manager, including its constituent components, object model, and key capabilities. - -Components -The major components of the File Manager are the Client and Server, the Repository Manager, the Catalog, the Validation Layer, the Versioner, and the Transferer. The relationship between all of these components are shown in the diagram below: - -File Manager Architecture - -The File Manager Server contains both a Repository that manages products (and the products' location in the archive as specified by Versioner), and a Catalog that validates metadata via the Validation Layer. Transfer of data products from the Client to the Server is the domain of the Transfer and can be initiated at either the Client or the Server. - -Object Model -The critical objects managed by the File Manager include: - -Products - Collections of one or more files, and their associated Metadata. -Metadata - A map of key->multiple values of descriptive information about a Product. See CAS-Metadata for more information on Metadata. -Reference - A pointer to a Product file's (or files') original location, and to its final resting location within the archive constructed by the File Manager. -Product Type - Descriptive information about a Product that includes what type of file URI generation scheme to use, the root repository location for a particular Product, and a description of the Product. -Element - A singular Metadata element, such as "Author", or "Creator". Elements may have additional metadata, in the form of the associated definition and even a corresponding Dublin Core attribute. See CAS-Metadata for more information on Metadata Elements. -Versioner - A URI generation scheme for Product Types that defines the location within the archive (built by the File Manager) where a file belonging to a Product (that belongs to the associated Product Type) should be placed. -Each Product contains 1 or more References, and one Metadata object. Each Product is a member of a single Product Type. The Metadata collected for each Product is defined by a mapping of Product Type->1...* Elements. Each Product Type has an associated Versioner. These relationships are shown in the below figure. - -File Manager Object Model -Key Capabilities -The File manager has been designed with a new of key capabilities in mind. These capabilities include: - -Easy management of different types of Products. The Repository Manager extension point is responsible for managing Product Types, and their associated information. Management of Product Types includes adding new types, deleting and updating existing types, and retrieving Product Type Objects, by their ID or by their name. - -Support for different kinds of back end catalogs. 
The Catalog extension point allows Product instance metadata and file location information to be stored in different types of back end data stores quite easily. Existing implementations of the Catalog interface include a JDBC based back end database, along with a flat-file index powered by Lucene. - -Management of Product instance information. Management includes adding, deleting and updating product instance information, including file locations (References), along with Product Metadata. It also includes retrieving Metadata and References associated with existing Products as well as obtaining the Products themselves. - -Element management for Metadata. The File Manager's Validation Layer extension point allows for the management of Element policy information in different types of back end stores. For instance, Element policy could be stored in XML files, a Database, or a Metadata Registry. - -Data transfer mechanism interface. By having an extension point for Data Transfer, the File Manager can support different Data Transfer protocols, both local and remote. - -Advanced support for File Repository layouts. The Versioner extension point allows for different File Repository layouts based on Product Types. - -Support for multiple Product structures. The File Manager Client allows for Products to be Flat, or Hierarchical-based. Flat products are collections of singular files that are aggregated together to make a Product. Hierarchical Products are Products that contain collections of directories, and sub-directories, and files. - -Design for scalability. The File Manager uses the popular client-server paradigm, allowing new File Manager servers to be instantiated, as needed, without affecting the File Manager clients, and vice-versa. - -Standard communication protocols. The File Manager uses XML-RPC as its main external interface between the File Manager client and server. XML-RPC, the little brother of SOAP, is fast, extensible, and uses the underlying HTTP protocol for data transfer. - -RSS-based Product syndication. The File Manager web interface allows for the RSS-based syndication of Product feeds based on Product Type. - -Data transfer status tracking. The File Manager tracks all current Product and File transfers and even publishes an RSS-feed of existing transfers. - -This capability set is not exhaustive, and is meant to give the user a feel for what general features are provided by the File Manager. Most likely the user will find that the File Manager provides many other capabilities besides those described here. - -Extension Points -We have constructed the File Manager making use of the factory method pattern to provide multiple extension points for the File Manager. An extension point is an interface within the File Manager that can have many implementations. This is particularly useful when it comes to software component configuration because it allows different implementations of an existing interface to be selected at deployment time. - -The factory method pattern is a creational pattern common to object oriented design. Each File Manager extension point involves the implementation of two interfaces: an extension factory and an extension implementation. At run-time, the File Manager loads a properties file specifies a factory class to use during extension point instantiation. 
For example, the File Manager may communicate with a database-based Catalog and an XML-based Element Store (called a Validation Layer), or it may use a Lucene-based Catalog and a database-based Validation Layer. -Using extension points, it is fairly simple to support many different types of what are typically referred to as "plug-in architectures." Each of the core extension points for the File Manager is described below: - -Catalog The Catalog extension point is responsible for storing all the instance data for Products, Metadata, and for file References. Additionally, the Catalog provides a query capability for Products. -Data Transfer The Data Transfer extension point allows for the movement of a Product to and from the archive managed by the File Manager component. Different protocols for Data Transfer may include local (disk-based) copy, or remote XML-RPC based transfer across networked machines. -Repository Manager The Repository Manager extension point provides a means for managing all of the policy information (i.e., the Product Types and their associated information) for Products managed by the File Manager. -Validation Layer The Validation Layer extension point allows for the querying of element definitions associated with a particular Product Type. The extension point also maps Product Type to Elements. -Versioning The Versioning extension point allows for the definition of different URI generation schemes that define the final resting location of files for a particular Product. -System The extension point that provides the external interface to the File Manager services. This includes the File Manager server interface, as well as the associated File Manager client interface, that communicates with the server. -Current Extension Point Implementations -There are at least two implementations of all of the aforementioned extension points for the File Manager. Each extension point implementation is detailed in this section. - -Catalog -Data Source based Catalog. An implementation of the Catalog extension point interface that uses a JDBC accessible database backend. -Lucene based Catalog. An implementation of the Catalog extension point interface that uses the Lucene free text index system to store Product instance information. -Data Transfer -Local Data Transfer. An implementation of the Data Transfer interface that uses Apache's commons-io to perform local, disk based filesystem data transfer. This implementation also supports locally accessible Network File System (NFS) disks. -Remote Data Transfer. An implementation of the Data Transfer interface that uses the XML-RPC File Manager client to transfer files to a remote XML-RPC File Manager server. -InPlace Data Transfer. An implementation of the Data Transfer interface that avoids transfering any products -- this can be used in the situation where metadata about a particular product should be recorded, but no physical transfer needs to occur. -Repository Manager -Data Source based Repository Manager. An implementation of the Repository Manager extension point that stores Product Type policy information in a JDBC accessible database. -XML based Repository Manager. An implementation of the Repository Manager extension point that stores Product Type policy information in an XML file called product-types.xml -Validation Layer -Data Source based Validation Layer. An implementation of the Validation Layer extension point that stores Element policy information in a JDBC accessible database. -XML based Validation Layer. 
An implementation of the Validation Layer extension point that stores Element policy information in 2 XML files called elements.xml and product-type-element-map.xml -System (File Manager client and File Manager server) -XML-RPC based File Manager server. An implementation of the external server interface for the File Manager that uses XML-RPC as the transportation medium. -XML-RPC based File Manager client. An implementation of the client interface for the XML-RPC File Manager server that uses XML-RPC as the transportation medium. -Use Cases -The File Manager was built to support several of the above capabilities outlined in Section 3. In particular there were several use cases that we wanted to support, some of which are described below. - -File Manager Ingest Use Case -The red numbers in the above Figure correspond to a sequence of steps that occurs and a series of interactions between the different File Manager extension points in order to perform the file ingestion activity. In Step 1, a File Manager client is invoked for the ingest operation, which sends Metadata and References for a particular Product to ingest to the File Manager server’s System Interface extension point. The System Interface uses the information about Product Type policy made available by the Repository Manager in order to understand whether or not the product should be transferred, where it’s root repository path should be, and so on. The System Interface then catalogs the file References and Metadata using the Catalog extension point. During this catalog process, the Catalog extension point uses the Validation Layer to determine which Elements should be extracted for the particular Product, based upon its Product Type. After that, Data Transfer is initiated either at the client or server end, and the first step to Data Transfer is using the Product’s associated Versioner to generate final file References. After final file References have been determined, the file data is transferred by the server or by the client, using the Data Transfer extension point. - -Conclusion -The aim of this document is to provide information relevant to developers about the CAS File Manager. Specifically, this document has described the File Manager's architecture, including its constituent components, object model and key capabilities. Additionally, the this document provides an overview of the current implementations of the File Manager's extension points. 
\ No newline at end of file +Introduction This&Catalog&依赖 +its&points& +guide&tailorable extension point&依赖 +guide&File Manager architecture and interface&依赖 +our&Guides& +remainder§ion&依赖 +remainder&guide&AGGREGATION +we&architecture&依赖 +we&constituent component&依赖 +its&components& +we&architecture&依赖 +we&constituent component&依赖 +architecture&File Manager&AGGREGATION +Client and Server&major component&依赖 +Client and Server&File Manager&依赖 +major component&File Manager&AGGREGATION +relationship&diagram&依赖 +Catalog&metada&依赖 +File Manager Server&Repository&依赖 +Catalog&Validation Layer&依赖 +products&location& +File Manager Server&' location&依赖 +Repository&product&依赖 +File Manager Server&archive as&依赖 +Transfer&Transfer&依赖 +Transfer&Transfer&依赖 +Transfer&data product&AGGREGATION +Transfer&Transfer&依赖 +domain&Transfer&AGGREGATION +Transfer&Transfer&依赖 +Transfer&Transfer&依赖 +Transfer&Transfer&依赖 +collection&one or more file&AGGREGATION +their&Metadata& +A map&key&AGGREGATION +> multiple value&descriptive information&AGGREGATION +See&Metadata&依赖 +See&more information&依赖 +its&location& +File Manager&what type&依赖 +File Manager&file urus generation scheme&依赖 +what type&file urus generation scheme&AGGREGATION +description&Product&AGGREGATION +element&form&依赖 +element&associated definition&依赖 +form&associated definition&AGGREGATION +element&additional metada&依赖 +A URI generation scheme&location&依赖 +A URI generation scheme&built&依赖 +A URI generation scheme&archive (&依赖 +Product&1 or more reference&依赖 +member&single Product Type&AGGREGATION +Product&single Product Type&依赖 +Product&mapping&依赖 +Product&Product Type&依赖 +mapping&Product Type&AGGREGATION +Product Type&associated Versioner&依赖 +relationship&below figure&依赖 +new&key capability&AGGREGATION +Easy management&different type&AGGREGATION +different type&Products&AGGREGATION +their&information& +their&ID& +Management&Product Types&AGGREGATION +Management&new type&依赖 +their&name& +Management&new type&依赖 +different kind&back end catalog&AGGREGATION +Catalog extension point&extension point&GENERALIZATION +Catalog extension point&product instance metada and file location information&依赖 +different type&back end data store&AGGREGATION +implementation&end database&依赖 +implementation&JDBC&依赖 +implementation&end database&依赖 +implementation&JDBC&依赖 +implementation&Catalog interface&AGGREGATION +Management&Product instance information&AGGREGATION +Management&include add , delete and update product instance information&依赖 +Management&file location&依赖 +It&Metadata and reference&依赖 +Manager&point& +different type&back end store&AGGREGATION +management&Element policy information&AGGREGATION +Element policy&instance&依赖 +Element policy&XML file&依赖 +Data Transfer&Transfer&GENERALIZATION +File Manager&different Data Transfer protocol&依赖 +Versioner extension point&different File Repository layout&依赖 +Versioner extension point&extension point&GENERALIZATION +Flat product&singular file&依赖 +collection&singular file&AGGREGATION +Products&directory&依赖 +collection&directory&AGGREGATION +File Manager&popular client-server paradigm&依赖 +File Manager&XML-RPC&依赖 +its&interface& +File Manager&File Manager client and server&依赖 +File Manager&main external interface&依赖 +little brother&SOAP&AGGREGATION +File Manager web interface&RSS-based syndication&依赖 +RSS-based syndication&Product feed&AGGREGATION +datum&status tracking&依赖 +RSS-feed&transfer&AGGREGATION +File Manager¤t Product and File transfer&依赖 +Extension Points We&file manager make use&依赖 +file manager make use&factory method 
pattern&AGGREGATION +interface&many implementation&依赖 +extension point&File Manager&依赖 +different implementation&interface&AGGREGATION +it&software component configuration&依赖 +it&different implementation&依赖 +it&interface&实现 +File Manager extension point&implementation&依赖 +File Manager extension point&two interface&实现 +implementation&two interface&AGGREGATION +File Manager extension point&extension factory&实现 +File Manager load&factory class&依赖 +File Manager load&factory class&依赖 +File Manager load&run-time&依赖 +File Manager load&run-time&依赖 +File Manager&example&依赖 +File Manager&database-based Catalog&依赖 +it&Lucene-based Catalog&依赖 +it&many different type&实现 +The Data Transfer extension point&Product&依赖 +The Data Transfer extension point&movement&依赖 +The Data Transfer extension point&archive&依赖 +movement&Product&AGGREGATION +Different protocol&local ( disk-based ) copy&依赖 +Different protocol&local ( disk-based ) copy&依赖 +extension point&Product Type&依赖 +extension point&element&依赖 +different URI generation scheme&file&依赖 +different URI generation scheme&final resting location&依赖 +final resting location&file&AGGREGATION +different URI generation scheme&particular Product&依赖 +definition&different URI generation scheme&AGGREGATION +Catalog Data Source&base catalog&依赖 +implementation&Catalog extension point interface&AGGREGATION +implementation&a jdbc accessible database backend&依赖 +lucene base catalog&lucene base catalog&依赖 +implementation&catalog extension point interface&AGGREGATION +implementation&catalog extension point interface&依赖 +lucene base catalog&Lucene free text index system&依赖 +implementation&catalog extension point interface&依赖 +implementation&Data Transfer interface&AGGREGATION +Apache&commons-io& +implementation&locally accessible network file system ( nfs ) disk&依赖 +implementation&XML-RPC File Manager client&依赖 +XML-RPC File Manager client&File Manager client&GENERALIZATION +implementation&XML-RPC File Manager client&依赖 +implementation&datum transfer interface&AGGREGATION +InPlace Data Transfer .&product&依赖 +implementation&product type policy information&依赖 +implementation&product type policy information&依赖 +implementation&repository manager extension point&AGGREGATION +implementation&JDBC accessible database&依赖 +implementation&JDBC accessible database&依赖 +XML file&file&GENERALIZATION +implementation&JDBC accessible database&依赖 +implementation&JDBC accessible database&依赖 +implementation&element policy information&依赖 +implementation&element policy information&依赖 +implementation&validation layer extension point&AGGREGATION +Validation Layer extension point&2 XML file&依赖 +implementation&) xml-rpc&依赖 +Validation Layer extension point&Element policy information&依赖 +implementation&( file manager client&依赖 +implementation&) xml-rpc&依赖 +implementation&Validation Layer extension point&AGGREGATION +implementation&( file manager client&依赖 +File Manager&transportation medium&依赖 +implementation&File Manager&依赖 +File Manager&use xml-rpc&依赖 +implementation&File Manager&依赖 +File Manager&use xml-rpc&依赖 +implementation&File Manager&依赖 +File Manager&transportation medium&依赖 +implementation&external server interface&AGGREGATION +XML-RPC&File Manager client&依赖 +implementation&client interface&AGGREGATION +implementation&XML-RPC File Manager server&依赖 +XML-RPC File Manager server&transportation medium&依赖 +implementation&XML-RPC File Manager server&依赖 +XML-RPC File Manager server&XML-RPC&依赖 +implementation&XML-RPC File Manager server&依赖 +several&above capability&AGGREGATION +we&that&依赖 +manager 
ingest use case red number&step&依赖 +manager ingest use case red number&step&依赖 +sequence&step&AGGREGATION +series&interaction&AGGREGATION +manager ingest use case red number&sequence&依赖 +manager ingest use case red number&sequence&依赖 +ingest operation&ingest&依赖 +File Manager client&Step 1&依赖 +ingest operation&a particular product&依赖 +File Manager client&ingest operation&依赖 +ingest operation&Metadata and References&依赖 +server&point& +System Interface&information&依赖 +it&path& +System Interface&made&依赖 +System Interface&Product Type policy&依赖 +Metadata&Catalog extension point&依赖 +System Interface&file reference&依赖 +Catalog extension point&catalog process&依赖 +Catalog extension point&Validation Layer&依赖 +its&Type& +first step&’s associated versioner&依赖 +Product&Versioner& +first step&’s associated versioner&依赖 +Data Transfer&client or server end&依赖 +aim&document&AGGREGATION +document&architecture&依赖 +document&constituent components , object model and key capability&依赖 +Manager&architecture& +its&model& +current implementation&extension point&AGGREGATION +overview¤t implementation&AGGREGATION +Manager&points& diff --git "a/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide-simEnts.txt" "b/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide-simEnts.txt" deleted file mode 100644 index fbb01d68e919c4da23d6cedfdae91f79d032c64b..0000000000000000000000000000000000000000 --- "a/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide-simEnts.txt" +++ /dev/null @@ -1,100 +0,0 @@ -Introduction -This is the developer guide for the Apache OODT Catalog and Archive Service (CAS) File Manager component, or File Manager for short. Primarily, this guide will explain the File Manager architecture and interfaces, including its tailorable extension points. For information on installation, configuration, and examples, please see our User Guides. - -The remainder of this guide is separated into the following sections: - -Project Description -Architecture -Extension Points -Current Extension Point Implementations -Project Description -The File Manager component is responsible for tracking, ingesting and moving file data and metadata between a client system and a server system. The File Manager is an extensible software component that provides an XML-RPC external interface, and a fully tailorable Java-based API for file management. - -Architecture -In this section, we will describe the architecture of the File Manager, including its constituent components, object model, and key capabilities. - -Components -The major components of the File Manager are the Client and Server, the Repository Manager, the Catalog, the Validation Layer, the Versioner, and the Transferer. The relationship between all of these components are shown in the diagram below: - -File Manager Architecture - -The File Manager Server contains both a Repository that manages products (and the products' location in the archive as specified by Versioner), and a Catalog that validates metadata via the Validation Layer. Transfer of data products from the Client to the Server is the domain of the Transfer and can be initiated at either the Client or the Server. - -Object Model -The critical objects managed by the File Manager include: - -Products - Collections of one or more files, and their associated Metadata. 
-Metadata - A map of key->multiple values of descriptive information about a Product. See CAS-Metadata for more information on Metadata. -Reference - A pointer to a Product file's (or files') original location, and to its final resting location within the archive constructed by the File Manager. -Product Type - Descriptive information about a Product that includes what type of file URI generation scheme to use, the root repository location for a particular Product, and a description of the Product. -Element - A singular Metadata element, such as "Author", or "Creator". Elements may have additional metadata, in the form of the associated definition and even a corresponding Dublin Core attribute. See CAS-Metadata for more information on Metadata Elements. -Versioner - A URI generation scheme for Product Types that defines the location within the archive (built by the File Manager) where a file belonging to a Product (that belongs to the associated Product Type) should be placed. -Each Product contains 1 or more References, and one Metadata object. Each Product is a member of a single Product Type. The Metadata collected for each Product is defined by a mapping of Product Type->1...* Elements. Each Product Type has an associated Versioner. These relationships are shown in the below figure. - -File Manager Object Model -Key Capabilities -The File manager has been designed with a new of key capabilities in mind. These capabilities include: - -Easy management of different types of Products. The Repository Manager extension point is responsible for managing Product Types, and their associated information. Management of Product Types includes adding new types, deleting and updating existing types, and retrieving Product Type Objects, by their ID or by their name. - -Support for different kinds of back end catalogs. The Catalog extension point allows Product instance metadata and file location information to be stored in different types of back end data stores quite easily. Existing implementations of the Catalog interface include a JDBC based back end database, along with a flat-file index powered by Lucene. - -Management of Product instance information. Management includes adding, deleting and updating product instance information, including file locations (References), along with Product Metadata. It also includes retrieving Metadata and References associated with existing Products as well as obtaining the Products themselves. - -Element management for Metadata. The File Manager's Validation Layer extension point allows for the management of Element policy information in different types of back end stores. For instance, Element policy could be stored in XML files, a Database, or a Metadata Registry. - -Data transfer mechanism interface. By having an extension point for Data Transfer, the File Manager can support different Data Transfer protocols, both local and remote. - -Advanced support for File Repository layouts. The Versioner extension point allows for different File Repository layouts based on Product Types. - -Support for multiple Product structures. The File Manager Client allows for Products to be Flat, or Hierarchical-based. Flat products are collections of singular files that are aggregated together to make a Product. Hierarchical Products are Products that contain collections of directories, and sub-directories, and files. - -Design for scalability. 
The File Manager uses the popular client-server paradigm, allowing new File Manager servers to be instantiated, as needed, without affecting the File Manager clients, and vice-versa. - -Standard communication protocols. The File Manager uses XML-RPC as its main external interface between the File Manager client and server. XML-RPC, the little brother of SOAP, is fast, extensible, and uses the underlying HTTP protocol for data transfer. - -RSS-based Product syndication. The File Manager web interface allows for the RSS-based syndication of Product feeds based on Product Type. - -Data transfer status tracking. The File Manager tracks all current Product and File transfers and even publishes an RSS-feed of existing transfers. - -This capability set is not exhaustive, and is meant to give the user a feel for what general features are provided by the File Manager. Most likely the user will find that the File Manager provides many other capabilities besides those described here. - -Extension Points -We have constructed the File Manager making use of the factory method pattern to provide multiple extension points for the File Manager. An extension point is an interface within the File Manager that can have many implementations. This is particularly useful when it comes to software component configuration because it allows different implementations of an existing interface to be selected at deployment time. - -The factory method pattern is a creational pattern common to object oriented design. Each File Manager extension point involves the implementation of two interfaces: an extension factory and an extension implementation. At run-time, the File Manager loads a properties file specifies a factory class to use during extension point instantiation. For example, the File Manager may communicate with a database-based Catalog and an XML-based Element Store (called a Validation Layer), or it may use a Lucene-based Catalog and a database-based Validation Layer. -Using extension points, it is fairly simple to support many different types of what are typically referred to as "plug-in architectures." Each of the core extension points for the File Manager is described below: - -Catalog The Catalog extension point is responsible for storing all the instance data for Products, Metadata, and for file References. Additionally, the Catalog provides a query capability for Products. -Data Transfer The Data Transfer extension point allows for the movement of a Product to and from the archive managed by the File Manager component. Different protocols for Data Transfer may include local (disk-based) copy, or remote XML-RPC based transfer across networked machines. -Repository Manager The Repository Manager extension point provides a means for managing all of the policy information (i.e., the Product Types and their associated information) for Products managed by the File Manager. -Validation Layer The Validation Layer extension point allows for the querying of element definitions associated with a particular Product Type. The extension point also maps Product Type to Elements. -Versioning The Versioning extension point allows for the definition of different URI generation schemes that define the final resting location of files for a particular Product. -System The extension point that provides the external interface to the File Manager services. This includes the File Manager server interface, as well as the associated File Manager client interface, that communicates with the server. 
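To illustrate the factory method pattern described above, the following sketch shows how an extension point implementation might be resolved from a properties file at startup. This is a minimal, hypothetical example: Catalog, CatalogFactory, LuceneCatalogFactory and the property name filemgr.catalog.factory are simplified stand-ins chosen for illustration, not the actual OODT CAS File Manager classes or configuration keys.

import java.util.Properties;

// Hypothetical, simplified extension-point interface; the real Catalog API is richer.
interface Catalog {
    void addProduct(String productId);
}

// Each extension point pairs an interface with an extension factory.
interface CatalogFactory {
    Catalog createCatalog();
}

// One possible implementation pair, standing in for e.g. a Lucene-backed catalog.
class LuceneCatalog implements Catalog {
    public void addProduct(String productId) {
        System.out.println("indexing product " + productId);
    }
}

class LuceneCatalogFactory implements CatalogFactory {
    public Catalog createCatalog() {
        return new LuceneCatalog();
    }
}

// At deployment time a properties file names the factory class to use;
// the server instantiates it reflectively and asks it for the implementation.
public class ExtensionPointLoader {
    static Catalog loadCatalog(Properties props) throws Exception {
        String factoryClass = props.getProperty("filemgr.catalog.factory");
        CatalogFactory factory = (CatalogFactory) Class.forName(factoryClass)
                .getDeclaredConstructor().newInstance();
        return factory.createCatalog();
    }

    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("filemgr.catalog.factory", LuceneCatalogFactory.class.getName());
        loadCatalog(props).addProduct("urn:example:product-1");
    }
}

Pointing filemgr.catalog.factory at a different factory class swaps the backing store at deployment time without recompiling the server, which is the flexibility the extension points are designed to provide.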
-Current Extension Point Implementations -There are at least two implementations of all of the aforementioned extension points for the File Manager. Each extension point implementation is detailed in this section. - -Catalog -Data Source based Catalog. An implementation of the Catalog extension point interface that uses a JDBC accessible database backend. -Lucene based Catalog. An implementation of the Catalog extension point interface that uses the Lucene free text index system to store Product instance information. -Data Transfer -Local Data Transfer. An implementation of the Data Transfer interface that uses Apache's commons-io to perform local, disk based filesystem data transfer. This implementation also supports locally accessible Network File System (NFS) disks. -Remote Data Transfer. An implementation of the Data Transfer interface that uses the XML-RPC File Manager client to transfer files to a remote XML-RPC File Manager server. -InPlace Data Transfer. An implementation of the Data Transfer interface that avoids transfering any products -- this can be used in the situation where metadata about a particular product should be recorded, but no physical transfer needs to occur. -Repository Manager -Data Source based Repository Manager. An implementation of the Repository Manager extension point that stores Product Type policy information in a JDBC accessible database. -XML based Repository Manager. An implementation of the Repository Manager extension point that stores Product Type policy information in an XML file called product-types.xml -Validation Layer -Data Source based Validation Layer. An implementation of the Validation Layer extension point that stores Element policy information in a JDBC accessible database. -XML based Validation Layer. An implementation of the Validation Layer extension point that stores Element policy information in 2 XML files called elements.xml and product-type-element-map.xml -System (File Manager client and File Manager server) -XML-RPC based File Manager server. An implementation of the external server interface for the File Manager that uses XML-RPC as the transportation medium. -XML-RPC based File Manager client. An implementation of the client interface for the XML-RPC File Manager server that uses XML-RPC as the transportation medium. -Use Cases -The File Manager was built to support several of the above capabilities outlined in Section 3. In particular there were several use cases that we wanted to support, some of which are described below. - -File Manager Ingest Use Case -The red numbers in the above Figure correspond to a sequence of steps that occurs and a series of interactions between the different File Manager extension points in order to perform the file ingestion activity. In Step 1, a File Manager client is invoked for the ingest operation, which sends Metadata and References for a particular Product to ingest to the File Manager server’s System Interface extension point. The System Interface uses the information about Product Type policy made available by the Repository Manager in order to understand whether or not the product should be transferred, where it’s root repository path should be, and so on. The System Interface then catalogs the file References and Metadata using the Catalog extension point. During this catalog process, the Catalog extension point uses the Validation Layer to determine which Elements should be extracted for the particular Product, based upon its Product Type. 
After that, Data Transfer is initiated either at the client or server end, and the first step to Data Transfer is using the Product’s associated Versioner to generate final file References. After final file References have been determined, the file data is transferred by the server or by the client, using the Data Transfer extension point. - -Conclusion -The aim of this document is to provide information relevant to developers about the CAS File Manager. Specifically, this document has described the File Manager's architecture, including its constituent components, object model and key capabilities. Additionally, the this document provides an overview of the current implementations of the File Manager's extension points. \ No newline at end of file diff --git "a/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide.txt.xml.xls" "b/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide.txt.xml.xls" new file mode 100644 index 0000000000000000000000000000000000000000..3747179a2d57675498e0efa5ebe208bfc65af1a3 Binary files /dev/null and "b/src/main/resources/cdtocode/doc/Apache OODT File Manager/cas-filemgr \342\200\223 CAS File Manager Developer Guide.txt.xml.xls" differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt new file mode 100644 index 0000000000000000000000000000000000000000..4af331b79395fa7e7deaf99d67ab9990ab74f8b4 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt @@ -0,0 +1,75 @@ +HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW + + + +1 Module View +In this section we describe the modular structure of the HDFS, version 0.21. + +This structure is derived from a static analysis of the source code, specifically focusing on the dependencies between classes and groups of classes (modules). This structure is shown in the figure below. +[Module view] +1.1 Modularity Risks +With development of the software over time, all source code inevitably drifts away from it’s initial "as-designed" structure. This section identifies four signals--characteristics of the code--that suggest that the source code has evolved to a less modular structure: + +Module with a very weak dependency on another module: If a dependency from one module to another is very weak, it could indicate that the dependency is not well thought-out, but rather a technical debt incurred by a short-term expediency to easily implement a feature or fix a bug rather than adhering strictly to a principled architecture. +Module within a module: If a part of a package only depends on itself, and has no incoming dependencies from other classes in the package, this suggests that it is really a separate module. +Class(es) with more connections to another module than to its own: If a class or group of classes in a module has more dependencies (incoming or outgoing) to classes in another module than its own, this could be a sign that the class or group of classes could perhaps better be located in the other module, or perhaps in a separate, shared module. 
+Cyclic dependencies: A cyclic dependency occurs when two or more classes depend on each other either directly or indirectly. We consider cyclic dependencies a signal for refactoring opportunities, because they make it harder to understand, reuse and test the code. +When these signals are applied to the module structure, it appears that the HDFS could be made more modular. Each of these refactoring opportunities is now discussed, presented per module. + +1.1.1.1 hdfs +Because the hdfs package now contains code that is used by both the client and the server, the package hdfs should be split into two: hdfs.client and hdfs.common. The hdfs.common package can contain all code that is shared by both the client and server modules, while the client would contain just the code necessary for the client. This division could look as follows: + +hdfs.client + +hdfs.common + +BlockMissingException.java +DFSClient.java +DFSInputStream.java +DFSOutputStream.java +ByteRangeInputStream.java +DFSClient.java +HftpFileSystem.java +HsftpFileSystem.java + +BlockReader.java +DeprecatedUTF8.java +DFSConfigKeys.java +DFSUtil.java +DistributedFileSystem.java +HdfsConfiguration.java +HDFSPolicyProvider.java + + + + +Currently, the default port numbers that the NameNode and DataNode run with are stored in the server.namenode and server.datanode packages respectively. If they would be stored in hdfs.common instead, servers that want to communicate with either the namenode or datanode server would not need a dependency on that server's package. + +1.1.1.2 hfds.security +security.token.delegation.DelegationTokenSecretManager depends on server.namenode.FSNameSystem, while the security code is used by other servers than the namenode. This could be refactored so that FSNameSystem is called from the namenode rather than the security module, removing a dependency. + +1.1.1.3 hdfs.protocol +The class BlockListAsLongs depends on ReplicaInfo in the server.datanode module, which looks like an unhealthy dependency, given that hdfs.protocol is used by all servers rather than just the datanode server. Building the block list is a task that is better performed in the hdfs.server.datanode module. + +1.1.1.4 hdfs.server.protocol +The server.protocol package depends on two classes in server.common that protocol just uses for defined constants. It seems that it would be better to store these constants in the server.protocol package, as they (proven by their use in server.protocol) define the communication between servers. There are also dependencies from hdfs.server.protocol to hdfs.server.datanode (in protocol.DataNodeRegistration) and hdfs.server.namenode (in protocol.CheckpointCommand). These dependencies exist because hdfs.server.protocol contains code to fill its protocol messages from these classes. It would remove the dependencies from protocol on these classes if datanode and namenode themselves would be responsible for filling in the protocol messages. + +1.1.1.5 server.common +IncorrectVersionException and InconsistentFSStateException would probably fit better in server.protocol. JspHelper depends on namenode; the function that uses it (JspHelper.sortNodeList) can be moved to the namenode package, since it's not relevant for other servers. + +1.1.1.6 hdfs.server.namenode +server.namenode depends on hdfs.DFSClient to create servlets. It appears that this code could be refactored to be put into hdfs.common. The class namenode.FSNameSystem is involved in multiple cyclic dependencies. 
It has a direct cyclic dependency with namenode.NameNode, namenode.FSNameSystemMetrics and namenode.LeaseManager, and there are indirect cyclic dependencies on more classes (for example UpgradeObjectNamenode, UpgradeManagerNamenode. + +1.1.1.7 hdfs.server.datanode +server.datanode also depends on hdfs.DFSClient. Putting this code in common would be a good refactoring opportunity. + +1.1.1.8 hdfs.server.balancer +server.balancer also depends on hdfs.DFSClient, again input from the community is greatly appreciated on if putting this code in common would be a good refactoring opportunity. Another refactoring possibility for the balancer is to remove the dependency on the namenode. The classes from namenode that the balancer depends on are: + +namenode.UnsupportedActionException, which could be moved to protocol, since it's a shared message between namenode and balancer +namenode.Namenode, on which it only depends to get the namenode's port number, which could be stored in the common package. +namenode.BlockPlacementPolicy, on which it depends to check if the block placement policy of the balancer matches the policy of the namenode. This check could be done through a protocol message in server.protcol as well. +Removing the dependency on namenode would make balancer a fully separate server, and would allow it to perform at the same level as datanode and namenode. The class server.balancer.Balancer contains several cyclic dependencies, however, they are all within classes in the same source file. This means the effect of the dependencies is likely less severe, but refactoring the dependency structure of this class could still be an opportunity to increase modularity. + +1.1.1.9 hdfs.tools +tools consists of a couple of different components that have low coupling between them. But because they all provide functionality that falls somewhat outside the main domain of a filesystem (debugging and administrative tools), it makes sense to keep them together in one package for the user's convenience. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..81287be724e8c0cc41ffbf463f512afff968fb9a Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP DISTRIBUTED FILE SYSTEM (HDFS) ARCHITECTURAL DOCUMENTATION - MODULE VIEW.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt new file mode 100644 index 0000000000000000000000000000000000000000..a93754189bc23764f68bc0be9490e4ec30a43ba8 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt @@ -0,0 +1,139 @@ +HADOOP ECOSYSTEM +Hadoop Ecosystem is neither a programming language nor a service, it is a platform or framework which solves big data problems. You can consider it as a suite which encompasses a number of services (ingesting, storing, analyzing and maintaining) inside it. Below are the Hadoop components, that together form a Hadoop ecosystem. 
+ HDFS -> Hadoop Distributed File System + YARN -> Yet Another Resource Negotiator + MapReduce -> Data processing using programming + Spark -> In-memory Data Processing + PIG, HIVE-> Data Processing Services using Query (SQL-like) + HBase -> NoSQL Database + Mahout, Spark MLlib -> Machine Learning + Apache Drill -> SQL on Hadoop + Zookeeper -> Managing Cluster + Oozie -> Job Scheduling + Flume, Sqoop -> Data Ingesting Services + Solr& Lucene -> Searching & Indexing + Ambari -> Provision, Monitor and Maintain cluster +HDFS + Hadoop Distributed File System is the core component or you can say, the backbone of Hadoop Ecosystem. + HDFS is the one, which makes it possible to store different types of large data sets (i.e. structured, unstructured and semi structured data). + HDFS creates a level of abstraction over the resources, from where we can see the whole HDFS as a single unit. + It helps us in storing our data across various nodes and maintaining the log file about the stored data (metadata). + HDFS has two core components, i.e. NameNode and DataNode. +1. The NameNode is the main node and it doesn’t store the actual data. It contains metadata, just like a log file or you can say as a table of content. Therefore, it requires less storage and high computational resources. +2. On the other hand, all your data is stored on the DataNodes and hence it requires more storage resources. These DataNodes are commodity hardware (like your laptops and desktops) in the distributed environment. That’s the reason, why Hadoop solutions are very cost effective. +3. You always communicate to the NameNode while writing the data. Then, it internally sends a request to the client to store and replicate data on various DataNodes. +YARN +Consider YARN as the brain of your Hadoop Ecosystem. It performs all your processing activities by allocating resources and scheduling tasks. + It has two major components, i.e. Resource Manager and Node Manager. +1. Resource Manager is again a main node in the processing department. +2. It receives the processing requests, and then passes the parts of requests to corresponding Node Managers accordingly, where the actual processing takes place. +3. Node Managers are installed on every Data Node. It is responsible for execution of task on every single Data Node. + +1. Schedulers: Based on your application resource requirements, Schedulers perform scheduling algorithms and allocates the resources. +2. Applications Manager: While Applications Manager accepts the job submission, negotiates to containers (i.e. the Data node environment where process executes) for executing the application specific Application Master and monitoring the progress. ApplicationMasters are the deamons which reside on DataNode and communicates to containers for execution of tasks on each DataNode. +3. ResourceManager has two components: Schedulers and application manager +MAPREDUCE +It is the core component of processing in a Hadoop Ecosystem as it provides the logic of processing. In other words, MapReduce is a software framework which helps in writing applications that processes large data sets using distributed and parallel algorithms inside Hadoop environment. + In a MapReduce program, Map() and Reduce() are two functions. +1. The Map function performs actions like filtering, grouping and sorting. +2. While Reduce function aggregates and summarizes the result produced by map function. +3. 
The result generated by the Map function is a key value pair (K, V) which acts as the input for Reduce function. +Let us take the above example to have a better understanding of a MapReduce program. We have a sample case of students and their respective departments. We want to calculate the number of students in each department. Initially, Map program will execute and calculate the students appearing in each department, producing the key value pair as mentioned above. This key value pair is the input to the Reduce function. The Reduce function will then aggregate each department and calculate the total number of students in each department and produce the given result. +APACHE PIG + PIG has two parts: Pig Latin, the language and the pig runtime, for the execution environment. You can better understand it as Java and JVM. + It supports pig latin language, which has SQL like command structure. +10 line of pig latin = approx. 200 lines of Map-Reduce Java code But don’t be shocked when I say that at the back end of Pig job, a map-reduce job executes. + The compiler internally converts pig latin to MapReduce. It produces a sequential set of MapReduce jobs, and that’s an abstraction (which works like black box). + PIG was initially developed by Yahoo. + It gives you a platform for building data flow for ETL (Extract, Transform and Load), processing and analyzing huge data sets. +How Pig works? In PIG, first the load command, loads the data. Then we perform various functions on it like grouping, filtering, joining, sorting, etc. At last, either you can dump the data on the screen or you can store the result back in HDFS. +APACHE HIVE + Facebook created HIVE for people who are fluent with SQL. Thus, HIVE makes them feel at home while working in a Hadoop Ecosystem. + Basically, HIVE is a data warehousing component which performs reading, writing and managing large data sets in a distributed environment using SQL-like interface. +HIVE + SQL = HQL + The query language of Hive is called Hive Query Language(HQL), which is very similar like SQL. + It has 2 basic components: Hive Command Line and JDBC/ODBC driver. + The Hive Command line interface is used to execute HQL commands. + While, Java Database Connectivity (JDBC) and Object Database Connectivity (ODBC) is used to establish connection from data storage. + Secondly, Hive is highly scalable. As, it can serve both the purposes, i.e. large data set processing (i.e. Batch query processing) and real time processing (i.e. Interactive query processing). + It supports all primitive data types of SQL. + You can use predefined functions, or write tailored user defined functions (UDF) also to accomplish your specific needs. +APACHE MAHOUT +Now, let us talk about Mahout which is renowned for machine learning. Mahout provides an environment for creating machine learning applications which are scalable. Machine learning algorithms allow us to build self-learning machines that evolve by itself without being explicitly programmed. Based on user behaviour, data patterns and past experiences it makes important future decisions. You can call it a descendant of Artificial Intelligence (AI). What Mahout does? It performs collaborative filtering, clustering and classification. Some people also consider frequent item set missing as Mahout’s function. Let us understand them individually: +1. Collaborative filtering: Mahout mines user behaviors, their patterns and their characteristics and based on that it predicts and make recommendations to the users. 
The typical use case is E-commerce website. +2. Clustering: It organizes a similar group of data together like articles can contain blogs, news, research papers etc. +3. Classification: It means classifying and categorizing data into various sub-departments like articles can be categorized into blogs, news, essay, research papers and other categories. +4. Frequent item set missing: Here Mahout checks, which objects are likely to be appearing together and make suggestions, if they are missing. For example, cell phone and cover are brought together in general. So, if you search for a cell phone, it will also recommend you the cover and cases. +Mahout provides a command line to invoke various algorithms. It has a predefined set of library which already contains different inbuilt algorithms for different use cases. +APACHE SPARK + Apache Spark is a framework for real time data analytics in a distributed computing environment. + The Spark is written in Scala and was originally developed at the University of California, Berkeley. + It executes in-memory computations to increase speed of data processing over Map-Reduce. + It is 100x faster than Hadoop for large scale data processing by exploiting in-memory computations and other optimizations. Therefore, it requires high processing power than Map-Reduce. +As you can see, Spark comes packed with high-level libraries, including support for R, SQL, Python, Scala, Java etc. These standard libraries increase the seamless integrations in complex workflow. Over this, it also allows various sets of services to integrate with it like MLlib, GraphX, SQL + Data Frames, Streaming services etc. to increase its capabilities. . Apache Spark best fits for real time processing, whereas Hadoop was designed to store unstructured data and execute batch processing over it. When we combine, Apache Spark’s ability, i.e. high processing speed, advance analytics and multiple integration support with Hadoop’s low cost operation on commodity hardware, it gives the best results. +That is the reason why, Spark and Hadoop are used together by many companies for processing and analyzing their Big Data stored in HDFS. +APACHE HBASE + HBase is an open source, non-relational distributed database. In other words, it is a NoSQL database. + It supports all types of data and that is why, it’s capable of handling anything and everything inside a Hadoop ecosystem. + It is modelled after Google’s BigTable, which is a distributed storage system designed to cope up with large data sets. + The HBase was designed to run on top of HDFS and provides BigTable like capabilities. + It gives us a fault tolerant way of storing sparse data, which is common in most Big Data use cases. + The HBase is written in Java, whereas HBase applications can be written in REST, Avro and Thrift APIs. +For better understanding, let us take an example. You have billions of customer emails and you need to find out the number of customers who has used the word complaint in their emails. The request needs to be processed quickly (i.e. at real time). So, here we are handling a large data set while retrieving a small amount of data. For solving these kind of problems, HBase was designed. +APACHE DRILL +Apache Drill is used to drill into any kind of data. It’s an open source application which works with distributed environment to analyze large data sets. + It is a replica of Google Dremel. + It supports different kinds NoSQL databases and file systems, which is a powerful feature of Drill. 
For example: Azure Blob Storage, Google Cloud Storage, HBase, MongoDB, MapR-DB HDFS, MapR-FS, Amazon S3, Swift, NAS and local files. +So, basically the main aim behind Apache Drill is to provide scalability so that we can process petabytes and exabytes of data efficiently (or you can say in minutes). + The main power of Apache Drill lies in combining a variety of data stores just by using a single query. + Apache Drill basically follows the ANSI SQL. + It has a powerful scalability factor in supporting millions of users and serve their query requests over large scale data. +APACHE ZOOKEEPER + Apache Zookeeper is the coordinator of any Hadoop job which includes a combination of various services in a Hadoop Ecosystem. + Apache Zookeeper coordinates with various services in a distributed environment. +Before Zookeeper, it was very difficult and time consuming to coordinate between different services in Hadoop Ecosystem. The services earlier had many problems with interactions like common configuration while synchronizing data. Even if the services are configured, changes in the configurations of the services make it complex and difficult to handle. The grouping and naming was also a time-consuming factor. Due to the above problems, Zookeeper was introduced. It saves a lot of time by performing synchronization, configuration maintenance, grouping and naming. +Although it’s a simple service, it can be used to build powerful solutions. +APACHE OOZIE +Consider Apache Oozie as a clock and alarm service inside Hadoop Ecosystem. For Apache jobs, Oozie has been just like a scheduler. It schedules Hadoop jobs and binds them together as one logical work. There are two kinds of Oozie jobs: +1. Oozie workflow: These are sequential set of actions to be executed. You can assume it as a relay race. Where each athlete waits for the last one to complete his part. +2. Oozie Coordinator: These are the Oozie jobs which are triggered when the data is made available to it. Think of this as the response-stimuli system in our body. In the same manner as we respond to an external stimulus, an Oozie coordinator responds to the availability of data and it rests otherwise. +APACHE FLUME +Ingesting data is an important part of our Hadoop Ecosystem. + The Flume is a service which helps in ingesting unstructured and semi-structured data into HDFS. + It gives us a solution which is reliable and distributed and helps us in collecting, aggregating and moving large amount of data sets. + It helps us to ingest online streaming data from various sources like network traffic, social media, email messages, log files etc. in HDFS. +Now, let us understand the architecture of Flume from the below diagram: +There is a Flume agent which ingests the streaming data from various data sources to HDFS. From the diagram, you can easily understand that the web server indicates the data source. Twitter is among one of the famous sources for streaming data. The flume agent has 3 components: source, sink and channel. +1. Source: it accepts the data from the incoming streamline and stores the data in the channel. +2. Channel: it acts as the local storage or the primary storage. A Channel is a temporary storage between the source of data and persistent data in the HDFS. +3. Sink: Then, our last component i.e. Sink, collects the data from the channel and commits or writes the data in the HDFS permanently. 
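As a rough illustration of the source/channel/sink decoupling described above, the sketch below models an agent pipeline with a bounded in-memory queue standing in for the channel. The class and method names are invented for this example and this is not Flume's actual API; a real agent would persist events and deliver them to HDFS rather than print them.

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class MiniAgent {
    // The "channel": temporary storage between the source and the sink.
    private final BlockingQueue<String> channel = new ArrayBlockingQueue<>(1000);

    // The "source": accepts events from the incoming stream and stores them in the channel.
    void source(String event) throws InterruptedException {
        channel.put(event); // blocks when the channel is full, giving back-pressure
    }

    // The "sink": drains the channel and commits events to the destination (HDFS in Flume).
    void sink() throws InterruptedException {
        while (true) {
            String event = channel.take();
            System.out.println("writing to destination: " + event);
        }
    }

    public static void main(String[] args) throws Exception {
        MiniAgent agent = new MiniAgent();
        Thread sinkThread = new Thread(() -> {
            try {
                agent.sink();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        sinkThread.setDaemon(true);
        sinkThread.start();
        agent.source("GET /index.html 200");
        agent.source("user tweeted: #hadoop");
        Thread.sleep(100); // give the sink a moment to drain the channel
    }
}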
+APACHE SQOOP +The major difference between Flume and Sqoop is that: + Flume only ingests unstructured data or semi-structured data into HDFS. + While Sqoop can import as well as export structured data from RDBMS or Enterprise data warehouses to HDFS or vice versa. +Let us understand how Sqoop works using the below diagram: +When we submit Sqoop command, our main task gets divided into sub tasks which is handled by individual Map Task internally. Map Task is the sub task, which imports part of data to the Hadoop Ecosystem. Collectively, all Map tasks imports the whole data. +Export also works in a similar manner. +When we submit our Job, it is mapped into Map Tasks which brings the chunk of data from HDFS. These chunks are exported to a structured data destination. Combining all these exported chunks of data, we receive the whole data at the destination, which in most of the cases is an RDBMS (MYSQL/Oracle/SQL Server). +APACHE SOLR & LUCENE +Apache Solr and Apache Lucene are the two services which are used for searching and indexing in Hadoop Ecosystem. + Apache Lucene is based on Java, which also helps in spell checking. + If Apache Lucene is the engine, Apache Solr is the car built around it. Solr is a complete application built around Lucene. + It uses the Lucene Java search library as a core for search and full indexing. +APACHE AMBARI +Ambari is an Apache Software Foundation Project which aims at making Hadoop ecosystem more manageable. +Big Data Hadoop Certification Training +Weekday / Weekend Batc +It includes software for provisioning, managing and monitoring Apache Hadoop clusters. The Ambari provides: +1. Hadoop cluster provisioning: + It gives us step by step process for installing Hadoop services across a number of hosts. + It also handles configuration of Hadoop services over a cluster. +2. Hadoop cluster management: + It provides a central management service for starting, stopping and re-configuring Hadoop services across the cluster. +3. Hadoop cluster monitoring: + For monitoring health and status, Ambari provides us a dashboard. + The Amber Alert framework is an alerting service which notifies the user, whenever the attention is needed. For example, if a node goes down or low disk space on a node, etc. +At last, I would like to draw your attention on three things importantly: +1. Hadoop Ecosystem owes its success to the whole developer community, many big companies like Facebook, Google, Yahoo, University of California (Berkeley) etc. have contributed their part to increase Hadoop’s capabilities. +2. Inside a Hadoop Ecosystem, knowledge about one or two tools (Hadoop components) would not help in building a solution. You need to learn a set of Hadoop components, which works together to build a solution. +3. Based on the use cases, we can choose a set of services from Hadoop Ecosystem and create a tailored solution for an organization. 
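To make the students-per-department example from the MAPREDUCE section above concrete, here is a minimal sketch using the Hadoop MapReduce Java API (org.apache.hadoop.mapreduce). It assumes each input line has the form "studentName,department"; the input layout and class names are assumptions for illustration, and the job driver (setting input/output paths, mapper and reducer classes) is omitted.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class DepartmentCount {

    // Map: emit (department, 1) for every student record.
    public static class DeptMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text department = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            if (fields.length == 2) {
                department.set(fields[1].trim());
                context.write(department, ONE);
            }
        }
    }

    // Reduce: sum the 1s for each department to get the student count.
    public static class DeptReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable v : values) {
                total += v.get();
            }
            context.write(key, new IntWritable(total));
        }
    }
}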
\ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..3914f2d5b9626018aa7d1e2e1c9c5e9af24a4580 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/HADOOP ECOSYSTEM.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt new file mode 100644 index 0000000000000000000000000000000000000000..811d08e8ccf6949edc37095172838b4580ae058d --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt @@ -0,0 +1,172 @@ +HDFS Architecture Guide +Introduction +Assumptions and Goals +Hardware Failure +Streaming Data Access +Large Data Sets +Simple Coherency Model +“Moving Computation is Cheaper than Moving Data” +Portability Across Heterogeneous Hardware and Software Platforms +NameNode and DataNodes +The File System Namespace +Data Replication +Replica Placement: The First Baby Steps +Replica Selection +Safemode +The Persistence of File System Metadata +The Communication Protocols +Robustness +Data Disk Failure, Heartbeats and Re-Replication +Cluster Rebalancing +Data Integrity +Metadata Disk Failure +Snapshots +Data Organization +Data Blocks +Staging +Replication Pipelining +Accessibility +FS Shell +DFSAdmin +Browser Interface +Space Reclamation +File Deletes and Undeletes +Decrease Replication Factor +References +Introduction +The Hadoop Distributed File System (HDFS) is a distributed file system designed to run on commodity hardware. It has many similarities with existing distributed file systems. However, the differences from other distributed file systems are significant. HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware. HDFS provides high throughput access to application data and is suitable for applications that have large data sets. HDFS relaxes a few POSIX requirements to enable streaming access to file system data. HDFS was originally built as infrastructure for the Apache Nutch web search engine project. HDFS is now an Apache Hadoop subproject. The project URL is https://hadoop.apache.org/hdfs/. + +Assumptions and Goals +Hardware Failure +Hardware failure is the norm rather than the exception. An HDFS instance may consist of hundreds or thousands of server machines, each storing part of the file system’s data. The fact that there are a huge number of components and that each component has a non-trivial probability of failure means that some component of HDFS is always non-functional. Therefore, detection of faults and quick, automatic recovery from them is a core architectural goal of HDFS. + +Streaming Data Access +Applications that run on HDFS need streaming access to their data sets. They are not general purpose applications that typically run on general purpose file systems. HDFS is designed more for batch processing rather than interactive use by users. The emphasis is on high throughput of data access rather than low latency of data access. POSIX imposes many hard requirements that are not needed for applications that are targeted for HDFS. POSIX semantics in a few key areas has been traded to increase data throughput rates. + +Large Data Sets +Applications that run on HDFS have large data sets. A typical file in HDFS is gigabytes to terabytes in size. 
Thus, HDFS is tuned to support large files. It should provide high aggregate data bandwidth and scale to hundreds of nodes in a single cluster. It should support tens of millions of files in a single instance. + +Simple Coherency Model +HDFS applications need a write-once-read-many access model for files. A file once created, written, and closed need not be changed. This assumption simplifies data coherency issues and enables high throughput data access. A MapReduce application or a web crawler application fits perfectly with this model. There is a plan to support appending-writes to files in the future. + +“Moving Computation is Cheaper than Moving Data” +A computation requested by an application is much more efficient if it is executed near the data it operates on. This is especially true when the size of the data set is huge. This minimizes network congestion and increases the overall throughput of the system. The assumption is that it is often better to migrate the computation closer to where the data is located rather than moving the data to where the application is running. HDFS provides interfaces for applications to move themselves closer to where the data is located. + +Portability Across Heterogeneous Hardware and Software Platforms +HDFS has been designed to be easily portable from one platform to another. This facilitates widespread adoption of HDFS as a platform of choice for a large set of applications. + +NameNode and DataNodes +HDFS has a master/slave architecture. An HDFS cluster consists of a single NameNode, a master server that manages the file system namespace and regulates access to files by clients. In addition, there are a number of DataNodes, usually one per node in the cluster, which manage storage attached to the nodes that they run on. HDFS exposes a file system namespace and allows user data to be stored in files. Internally, a file is split into one or more blocks and these blocks are stored in a set of DataNodes. The NameNode executes file system namespace operations like opening, closing, and renaming files and directories. It also determines the mapping of blocks to DataNodes. The DataNodes are responsible for serving read and write requests from the file system’s clients. The DataNodes also perform block creation, deletion, and replication upon instruction from the NameNode. + +HDFS Architecture +The NameNode and DataNode are pieces of software designed to run on commodity machines. These machines typically run a GNU/Linux operating system (OS). HDFS is built using the Java language; any machine that supports Java can run the NameNode or the DataNode software. Usage of the highly portable Java language means that HDFS can be deployed on a wide range of machines. A typical deployment has a dedicated machine that runs only the NameNode software. Each of the other machines in the cluster runs one instance of the DataNode software. The architecture does not preclude running multiple DataNodes on the same machine but in a real deployment that is rarely the case. + +The existence of a single NameNode in a cluster greatly simplifies the architecture of the system. The NameNode is the arbitrator and repository for all HDFS metadata. The system is designed in such a way that user data never flows through the NameNode. + +The File System Namespace +HDFS supports a traditional hierarchical file organization. A user or an application can create directories and store files inside these directories. 
The file system namespace hierarchy is similar to most other existing file systems; one can create and remove files, move a file from one directory to another, or rename a file. HDFS does not yet implement user quotas. HDFS does not support hard links or soft links. However, the HDFS architecture does not preclude implementing these features. + +The NameNode maintains the file system namespace. Any change to the file system namespace or its properties is recorded by the NameNode. An application can specify the number of replicas of a file that should be maintained by HDFS. The number of copies of a file is called the replication factor of that file. This information is stored by the NameNode. + +Data Replication +HDFS is designed to reliably store very large files across machines in a large cluster. It stores each file as a sequence of blocks; all blocks in a file except the last block are the same size. The blocks of a file are replicated for fault tolerance. The block size and replication factor are configurable per file. An application can specify the number of replicas of a file. The replication factor can be specified at file creation time and can be changed later. Files in HDFS are write-once and have strictly one writer at any time. + +The NameNode makes all decisions regarding replication of blocks. It periodically receives a Heartbeat and a Blockreport from each of the DataNodes in the cluster. Receipt of a Heartbeat implies that the DataNode is functioning properly. A Blockreport contains a list of all blocks on a DataNode. + +HDFS DataNodes +Replica Placement: The First Baby Steps +The placement of replicas is critical to HDFS reliability and performance. Optimizing replica placement distinguishes HDFS from most other distributed file systems. This is a feature that needs lots of tuning and experience. The purpose of a rack-aware replica placement policy is to improve data reliability, availability, and network bandwidth utilization. The current implementation for the replica placement policy is a first effort in this direction. The short-term goals of implementing this policy are to validate it on production systems, learn more about its behavior, and build a foundation to test and research more sophisticated policies. + +Large HDFS instances run on a cluster of computers that commonly spread across many racks. Communication between two nodes in different racks has to go through switches. In most cases, network bandwidth between machines in the same rack is greater than network bandwidth between machines in different racks. + +The NameNode determines the rack id each DataNode belongs to via the process outlined in Hadoop Rack Awareness. A simple but non-optimal policy is to place replicas on unique racks. This prevents losing data when an entire rack fails and allows use of bandwidth from multiple racks when reading data. This policy evenly distributes replicas in the cluster which makes it easy to balance load on component failure. However, this policy increases the cost of writes because a write needs to transfer blocks to multiple racks. + +For the common case, when the replication factor is three, HDFS’s placement policy is to put one replica on one node in the local rack, another on a node in a different (remote) rack, and the last on a different node in the same remote rack. This policy cuts the inter-rack write traffic which generally improves write performance. 
The chance of rack failure is far less than that of node failure; this policy does not impact data reliability and availability guarantees. However, it does reduce the aggregate network bandwidth used when reading data since a block is placed in only two unique racks rather than three. With this policy, the replicas of a file do not evenly distribute across the racks. One third of replicas are on one node, two thirds of replicas are on one rack, and the other third are evenly distributed across the remaining racks. This policy improves write performance without compromising data reliability or read performance. + +The current, default replica placement policy described here is a work in progress. + +Replica Selection +To minimize global bandwidth consumption and read latency, HDFS tries to satisfy a read request from a replica that is closest to the reader. If there exists a replica on the same rack as the reader node, then that replica is preferred to satisfy the read request. If angg/ HDFS cluster spans multiple data centers, then a replica that is resident in the local data center is preferred over any remote replica. + +Safemode +On startup, the NameNode enters a special state called Safemode. Replication of data blocks does not occur when the NameNode is in the Safemode state. The NameNode receives Heartbeat and Blockreport messages from the DataNodes. A Blockreport contains the list of data blocks that a DataNode is hosting. Each block has a specified minimum number of replicas. A block is considered safely replicated when the minimum number of replicas of that data block has checked in with the NameNode. After a configurable percentage of safely replicated data blocks checks in with the NameNode (plus an additional 30 seconds), the NameNode exits the Safemode state. It then determines the list of data blocks (if any) that still have fewer than the specified number of replicas. The NameNode then replicates these blocks to other DataNodes. + +The Persistence of File System Metadata +The HDFS namespace is stored by the NameNode. The NameNode uses a transaction log called the EditLog to persistently record every change that occurs to file system metadata. For example, creating a new file in HDFS causes the NameNode to insert a record into the EditLog indicating this. Similarly, changing the replication factor of a file causes a new record to be inserted into the EditLog. The NameNode uses a file in its local host OS file system to store the EditLog. The entire file system namespace, including the mapping of blocks to files and file system properties, is stored in a file called the FsImage. The FsImage is stored as a file in the NameNode’s local file system too. + +The NameNode keeps an image of the entire file system namespace and file Blockmap in memory. This key metadata item is designed to be compact, such that a NameNode with 4 GB of RAM is plenty to support a huge number of files and directories. When the NameNode starts up, it reads the FsImage and EditLog from disk, applies all the transactions from the EditLog to the in-memory representation of the FsImage, and flushes out this new version into a new FsImage on disk. It can then truncate the old EditLog because its transactions have been applied to the persistent FsImage. This process is called a checkpoint. In the current implementation, a checkpoint only occurs when the NameNode starts up. Work is in progress to support periodic checkpointing in the near future. 
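The checkpoint described above (load the FsImage, replay the EditLog, write a new FsImage) can be summarized by the sketch below. This is only an illustration of the idea, not the NameNode's actual implementation; all types and method names are invented placeholders.

import java.util.List;
import java.util.Map;

// Hypothetical types standing in for the in-memory namespace and the on-disk logs.
public class CheckpointSketch {

    static Map<String, FileMetadata> checkpoint(FsImage fsImage, EditLog editLog) {
        // 1. Read the last persisted namespace image into memory.
        Map<String, FileMetadata> namespace = fsImage.load();

        // 2. Replay every transaction recorded since that image was written.
        for (EditLogEntry edit : editLog.readAll()) {
            edit.applyTo(namespace);
        }

        // 3. Flush the merged state as a new FsImage; the old EditLog can then be truncated.
        fsImage.save(namespace);
        editLog.truncate();
        return namespace;
    }

    // Minimal placeholder types so the sketch is self-contained.
    interface FileMetadata {}
    interface EditLogEntry { void applyTo(Map<String, FileMetadata> namespace); }
    interface FsImage {
        Map<String, FileMetadata> load();
        void save(Map<String, FileMetadata> namespace);
    }
    interface EditLog {
        List<EditLogEntry> readAll();
        void truncate();
    }
}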
+ +The DataNode stores HDFS data in files in its local file system. The DataNode has no knowledge about HDFS files. It stores each block of HDFS data in a separate file in its local file system. The DataNode does not create all files in the same directory. Instead, it uses a heuristic to determine the optimal number of files per directory and creates subdirectories appropriately. It is not optimal to create all local files in the same directory because the local file system might not be able to efficiently support a huge number of files in a single directory. When a DataNode starts up, it scans through its local file system, generates a list of all HDFS data blocks that correspond to each of these local files and sends this report to the NameNode: this is the Blockreport. + +The Communication Protocols +All HDFS communication protocols are layered on top of the TCP/IP protocol. A client establishes a connection to a configurable TCP port on the NameNode machine. It talks the ClientProtocol with the NameNode. The DataNodes talk to the NameNode using the DataNode Protocol. A Remote Procedure Call (RPC) abstraction wraps both the Client Protocol and the DataNode Protocol. By design, the NameNode never initiates any RPCs. Instead, it only responds to RPC requests issued by DataNodes or clients. + +Robustness +The primary objective of HDFS is to store data reliably even in the presence of failures. The three common types of failures are NameNode failures, DataNode failures and network partitions. + +Data Disk Failure, Heartbeats and Re-Replication +Each DataNode sends a Heartbeat message to the NameNode periodically. A network partition can cause a subset of DataNodes to lose connectivity with the NameNode. The NameNode detects this condition by the absence of a Heartbeat message. The NameNode marks DataNodes without recent Heartbeats as dead and does not forward any new IO requests to them. Any data that was registered to a dead DataNode is not available to HDFS any more. DataNode death may cause the replication factor of some blocks to fall below their specified value. The NameNode constantly tracks which blocks need to be replicated and initiates replication whenever necessary. The necessity for re-replication may arise due to many reasons: a DataNode may become unavailable, a replica may become corrupted, a hard disk on a DataNode may fail, or the replication factor of a file may be increased. + +Cluster Rebalancing +The HDFS architecture is compatible with data rebalancing schemes. A scheme might automatically move data from one DataNode to another if the free space on a DataNode falls below a certain threshold. In the event of a sudden high demand for a particular file, a scheme might dynamically create additional replicas and rebalance other data in the cluster. These types of data rebalancing schemes are not yet implemented. + +Data Integrity +It is possible that a block of data fetched from a DataNode arrives corrupted. This corruption can occur because of faults in a storage device, network faults, or buggy software. The HDFS client software implements checksum checking on the contents of HDFS files. When a client creates an HDFS file, it computes a checksum of each block of the file and stores these checksums in a separate hidden file in the same HDFS namespace. When a client retrieves file contents it verifies that the data it received from each DataNode matches the checksum stored in the associated checksum file. 
If not, then the client can opt to retrieve that block from another DataNode that has a replica of that block. + +Metadata Disk Failure +The FsImage and the EditLog are central data structures of HDFS. A corruption of these files can cause the HDFS instance to be non-functional. For this reason, the NameNode can be configured to support maintaining multiple copies of the FsImage and EditLog. Any update to either the FsImage or EditLog causes each of the FsImages and EditLogs to get updated synchronously. This synchronous updating of multiple copies of the FsImage and EditLog may degrade the rate of namespace transactions per second that a NameNode can support. However, this degradation is acceptable because even though HDFS applications are very data intensive in nature, they are not metadata intensive. When a NameNode restarts, it selects the latest consistent FsImage and EditLog to use. + +The NameNode machine is a single point of failure for an HDFS cluster. If the NameNode machine fails, manual intervention is necessary. Currently, automatic restart and failover of the NameNode software to another machine is not supported. + +Snapshots +Snapshots support storing a copy of data at a particular instant of time. One usage of the snapshot feature may be to roll back a corrupted HDFS instance to a previously known good point in time. HDFS does not currently support snapshots but will in a future release. + +Data Organization +Data Blocks +HDFS is designed to support very large files. Applications that are compatible with HDFS are those that deal with large data sets. These applications write their data only once but they read it one or more times and require these reads to be satisfied at streaming speeds. HDFS supports write-once-read-many semantics on files. A typical block size used by HDFS is 64 MB. Thus, an HDFS file is chopped up into 64 MB chunks, and if possible, each chunk will reside on a different DataNode. + +Staging +A client request to create a file does not reach the NameNode immediately. In fact, initially the HDFS client caches the file data into a temporary local file. Application writes are transparently redirected to this temporary local file. When the local file accumulates data worth over one HDFS block size, the client contacts the NameNode. The NameNode inserts the file name into the file system hierarchy and allocates a data block for it. The NameNode responds to the client request with the identity of the DataNode and the destination data block. Then the client flushes the block of data from the local temporary file to the specified DataNode. When a file is closed, the remaining un-flushed data in the temporary local file is transferred to the DataNode. The client then tells the NameNode that the file is closed. At this point, the NameNode commits the file creation operation into a persistent store. If the NameNode dies before the file is closed, the file is lost. + +The above approach has been adopted after careful consideration of target applications that run on HDFS. These applications need streaming writes to files. If a client writes to a remote file directly without any client side buffering, the network speed and the congestion in the network impacts throughput considerably. This approach is not without precedent. Earlier distributed file systems, e.g. AFS, have used client side caching to improve performance. A POSIX requirement has been relaxed to achieve higher performance of data uploads. 
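Because the staging described here happens behind the client library, an application simply writes through the FileSystem API and lets the client handle buffering and block allocation. The sketch below uses the standard Hadoop Java client (org.apache.hadoop.fs.FileSystem); it assumes the Hadoop client libraries are on the classpath and that fs.defaultFS points at the target cluster, and the path is only an example.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsWriteExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(); // picks up core-site.xml / hdfs-site.xml
        Path file = new Path("/foodir/myfile.txt");

        try (FileSystem fs = FileSystem.get(conf)) {
            // Writes are buffered on the client side; blocks are staged and pipelined
            // to the DataNodes transparently, and the file is committed on close().
            try (FSDataOutputStream out = fs.create(file)) {
                out.write("hello hdfs\n".getBytes(StandardCharsets.UTF_8));
            }

            // Reads are served from a DataNode holding a replica close to the reader.
            try (FSDataInputStream in = fs.open(file)) {
                byte[] buffer = new byte[128];
                int n = in.read(buffer);
                System.out.println(new String(buffer, 0, n, StandardCharsets.UTF_8));
            }
        }
    }
}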
+ +Replication Pipelining +When a client is writing data to an HDFS file, its data is first written to a local file as explained in the previous section. Suppose the HDFS file has a replication factor of three. When the local file accumulates a full block of user data, the client retrieves a list of DataNodes from the NameNode. This list contains the DataNodes that will host a replica of that block. The client then flushes the data block to the first DataNode. The first DataNode starts receiving the data in small portions (4 KB), writes each portion to its local repository and transfers that portion to the second DataNode in the list. The second DataNode, in turn starts receiving each portion of the data block, writes that portion to its repository and then flushes that portion to the third DataNode. Finally, the third DataNode writes the data to its local repository. Thus, a DataNode can be receiving data from the previous one in the pipeline and at the same time forwarding data to the next one in the pipeline. Thus, the data is pipelined from one DataNode to the next. + +Accessibility +HDFS can be accessed from applications in many different ways. Natively, HDFS provides a Java API for applications to use. A C language wrapper for this Java API is also available. In addition, an HTTP browser can also be used to browse the files of an HDFS instance. Work is in progress to expose HDFS through the WebDAV protocol. + +FS Shell +HDFS allows user data to be organized in the form of files and directories. It provides a commandline interface called FS shell that lets a user interact with the data in HDFS. The syntax of this command set is similar to other shells (e.g. bash, csh) that users are already familiar with. Here are some sample action/command pairs: + +Action Command +Create a directory named /foodir bin/hadoop dfs -mkdir /foodir +Remove a directory named /foodir bin/hadoop dfs -rmr /foodir +View the contents of a file named /foodir/myfile.txt bin/hadoop dfs -cat /foodir/myfile.txt +FS shell is targeted for applications that need a scripting language to interact with the stored data. + +DFSAdmin +The DFSAdmin command set is used for administering an HDFS cluster. These are commands that are used only by an HDFS administrator. Here are some sample action/command pairs: + +Action Command +Put the cluster in Safemode bin/hadoop dfsadmin -safemode enter +Generate a list of DataNodes bin/hadoop dfsadmin -report +Recommission or decommission DataNode(s) bin/hadoop dfsadmin -refreshNodes +Browser Interface +A typical HDFS install configures a web server to expose the HDFS namespace through a configurable TCP port. This allows a user to navigate the HDFS namespace and view the contents of its files using a web browser. + +Space Reclamation +File Deletes and Undeletes +When a file is deleted by a user or an application, it is not immediately removed from HDFS. Instead, HDFS first renames it to a file in the /trash directory. The file can be restored quickly as long as it remains in /trash. A file remains in /trash for a configurable amount of time. After the expiry of its life in /trash, the NameNode deletes the file from the HDFS namespace. The deletion of a file causes the blocks associated with the file to be freed. Note that there could be an appreciable time delay between the time a file is deleted by a user and the time of the corresponding increase in free space in HDFS. + +A user can Undelete a file after deleting it as long as it remains in the /trash directory. 
If a user wants to undelete a file that he/she has deleted, he/she can navigate the /trash directory and retrieve the file. The /trash directory contains only the latest copy of the file that was deleted. The /trash directory is just like any other directory with one special feature: HDFS applies specified policies to automatically delete files from this directory. The current default policy is to delete files from /trash that are more than 6 hours old. In the future, this policy will be configurable through a well defined interface. + +Decrease Replication Factor +When the replication factor of a file is reduced, the NameNode selects excess replicas that can be deleted. The next Heartbeat transfers this information to the DataNode. The DataNode then removes the corresponding blocks and the corresponding free space appears in the cluster. Once again, there might be a time delay between the completion of the setReplication API call and the appearance of free space in the cluster. + +References +HDFS Java API: https://hadoop.apache.org/core/docs/current/api/ + +HDFS source code: https://hadoop.apache.org/hdfs/version_control.html + +by Dhruba Borthakur \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..5d0278ae9431acf4a92aca1f2c1e0801518a91ea Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture Guide.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture.txt new file mode 100644 index 0000000000000000000000000000000000000000..c812cce02a7b16e36284136575d0f9b8172d71ea --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture.txt @@ -0,0 +1,359 @@ +General +Overview +Single Node Setup +Cluster Setup +Commands Reference +FileSystem Shell +Compatibility Specification +Downstream Developer's Guide +Admin Compatibility Guide +Interface Classification +FileSystem Specification +Common +CLI Mini Cluster +Fair Call Queue +Native Libraries +Proxy User +Rack Awareness +Secure Mode +Service Level Authorization +HTTP Authentication +Credential Provider API +Hadoop KMS +Tracing +Unix Shell Guide +Registry +HDFS +Architecture +User Guide +Commands Reference +NameNode HA With QJM +NameNode HA With NFS +Observer NameNode +Federation +ViewFs +ViewFsOverloadScheme +Snapshots +Edits Viewer +Image Viewer +Permissions and HDFS +Quotas and HDFS +libhdfs (C API) +WebHDFS (REST API) +HttpFS +Short Circuit Local Reads +Centralized Cache Management +NFS Gateway +Rolling Upgrade +Extended Attributes +Transparent Encryption +Multihoming +Storage Policies +Memory Storage Support +Synthetic Load Generator +Erasure Coding +Disk Balancer +Upgrade Domain +DataNode Admin +Router Federation +Provided Storage +MapReduce +Tutorial +Commands Reference +Compatibility with 1.x +Encrypted Shuffle +Pluggable Shuffle/Sort +Distributed Cache Deploy +Support for YARN Shared Cache +MapReduce REST APIs +MR Application Master +MR History Server +YARN +Architecture +Commands Reference +Capacity Scheduler +Fair Scheduler +ResourceManager Restart +ResourceManager HA +Resource Model +Node Labels +Node Attributes +Web Application Proxy +Timeline Server +Timeline Service V.2 +Writing YARN Applications +YARN Application Security +NodeManager +Running Applications 
in Docker Containers +Running Applications in runC Containers +Using CGroups +Secure Containers +Reservation System +Graceful Decommission +Opportunistic Containers +YARN Federation +Shared Cache +Using GPU +Using FPGA +Placement Constraints +YARN UI2 +YARN REST APIs +Introduction +Resource Manager +Node Manager +Timeline Server +Timeline Service V.2 +YARN Service +Overview +QuickStart +Concepts +Yarn Service API +Service Discovery +System Services +Hadoop Compatible File Systems +Aliyun OSS +Amazon S3 +Azure Blob Storage +Azure Data Lake Storage +OpenStack Swift +Tencent COS +Auth +Overview +Examples +Configuration +Building +Tools +Hadoop Streaming +Hadoop Archives +Hadoop Archive Logs +DistCp +GridMix +Rumen +Resource Estimator Service +Scheduler Load Simulator +Hadoop Benchmarking +Dynamometer +Reference +Changelog and Release Notes +Java API docs +Unix Shell API +Metrics +Configuration +core-default.xml +hdfs-default.xml +hdfs-rbf-default.xml +mapred-default.xml +yarn-default.xml +kms-default.xml +httpfs-default.xml +Deprecated Properties +Built by Maven +HDFS Architecture +Introduction +Assumptions and Goals +Hardware Failure +Streaming Data Access +Large Data Sets +Simple Coherency Model +“Moving Computation is Cheaper than Moving Data” +Portability Across Heterogeneous Hardware and Software Platforms +NameNode and DataNodes +The File System Namespace +Data Replication +Replica Placement: The First Baby Steps +Replica Selection +Block Placement Policies +Safemode +The Persistence of File System Metadata +The Communication Protocols +Robustness +Data Disk Failure, Heartbeats and Re-Replication +Cluster Rebalancing +Data Integrity +Metadata Disk Failure +Snapshots +Data Organization +Data Blocks +Replication Pipelining +Accessibility +FS Shell +DFSAdmin +Browser Interface +Space Reclamation +File Deletes and Undeletes +Decrease Replication Factor +References +Introduction +The Hadoop Distributed File System (HDFS) is a distributed file system designed to run on commodity hardware. It has many similarities with existing distributed file systems. However, the differences from other distributed file systems are significant. HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware. HDFS provides high throughput access to application data and is suitable for applications that have large data sets. HDFS relaxes a few POSIX requirements to enable streaming access to file system data. HDFS was originally built as infrastructure for the Apache Nutch web search engine project. HDFS is part of the Apache Hadoop Core project. The project URL is http://hadoop.apache.org/. + +Assumptions and Goals +Hardware Failure +Hardware failure is the norm rather than the exception. An HDFS instance may consist of hundreds or thousands of server machines, each storing part of the file system’s data. The fact that there are a huge number of components and that each component has a non-trivial probability of failure means that some component of HDFS is always non-functional. Therefore, detection of faults and quick, automatic recovery from them is a core architectural goal of HDFS. + +Streaming Data Access +Applications that run on HDFS need streaming access to their data sets. They are not general purpose applications that typically run on general purpose file systems. HDFS is designed more for batch processing rather than interactive use by users. The emphasis is on high throughput of data access rather than low latency of data access. 
POSIX imposes many hard requirements that are not needed for applications that are targeted for HDFS. POSIX semantics in a few key areas has been traded to increase data throughput rates. + +Large Data Sets +Applications that run on HDFS have large data sets. A typical file in HDFS is gigabytes to terabytes in size. Thus, HDFS is tuned to support large files. It should provide high aggregate data bandwidth and scale to hundreds of nodes in a single cluster. It should support tens of millions of files in a single instance. + +Simple Coherency Model +HDFS applications need a write-once-read-many access model for files. A file once created, written, and closed need not be changed except for appends and truncates. Appending the content to the end of the files is supported but cannot be updated at arbitrary point. This assumption simplifies data coherency issues and enables high throughput data access. A MapReduce application or a web crawler application fits perfectly with this model. + +“Moving Computation is Cheaper than Moving Data” +A computation requested by an application is much more efficient if it is executed near the data it operates on. This is especially true when the size of the data set is huge. This minimizes network congestion and increases the overall throughput of the system. The assumption is that it is often better to migrate the computation closer to where the data is located rather than moving the data to where the application is running. HDFS provides interfaces for applications to move themselves closer to where the data is located. + +Portability Across Heterogeneous Hardware and Software Platforms +HDFS has been designed to be easily portable from one platform to another. This facilitates widespread adoption of HDFS as a platform of choice for a large set of applications. + +NameNode and DataNodes +HDFS has a master/slave architecture. An HDFS cluster consists of a single NameNode, a master server that manages the file system namespace and regulates access to files by clients. In addition, there are a number of DataNodes, usually one per node in the cluster, which manage storage attached to the nodes that they run on. HDFS exposes a file system namespace and allows user data to be stored in files. Internally, a file is split into one or more blocks and these blocks are stored in a set of DataNodes. The NameNode executes file system namespace operations like opening, closing, and renaming files and directories. It also determines the mapping of blocks to DataNodes. The DataNodes are responsible for serving read and write requests from the file system’s clients. The DataNodes also perform block creation, deletion, and replication upon instruction from the NameNode. + +HDFS Architecture + +The NameNode and DataNode are pieces of software designed to run on commodity machines. These machines typically run a GNU/Linux operating system (OS). HDFS is built using the Java language; any machine that supports Java can run the NameNode or the DataNode software. Usage of the highly portable Java language means that HDFS can be deployed on a wide range of machines. A typical deployment has a dedicated machine that runs only the NameNode software. Each of the other machines in the cluster runs one instance of the DataNode software. The architecture does not preclude running multiple DataNodes on the same machine but in a real deployment that is rarely the case. + +The existence of a single NameNode in a cluster greatly simplifies the architecture of the system. 
The NameNode is the arbitrator and repository for all HDFS metadata. The system is designed in such a way that user data never flows through the NameNode. + +The File System Namespace +HDFS supports a traditional hierarchical file organization. A user or an application can create directories and store files inside these directories. The file system namespace hierarchy is similar to most other existing file systems; one can create and remove files, move a file from one directory to another, or rename a file. HDFS supports user quotas and access permissions. HDFS does not support hard links or soft links. However, the HDFS architecture does not preclude implementing these features. + +While HDFS follows naming convention of the FileSystem, some paths and names (e.g. /.reserved and .snapshot ) are reserved. Features such as transparent encryption and snapshot use reserved paths. + +The NameNode maintains the file system namespace. Any change to the file system namespace or its properties is recorded by the NameNode. An application can specify the number of replicas of a file that should be maintained by HDFS. The number of copies of a file is called the replication factor of that file. This information is stored by the NameNode. + +Data Replication +HDFS is designed to reliably store very large files across machines in a large cluster. It stores each file as a sequence of blocks. The blocks of a file are replicated for fault tolerance. The block size and replication factor are configurable per file. + +All blocks in a file except the last block are the same size, while users can start a new block without filling out the last block to the configured block size after the support for variable length block was added to append and hsync. + +An application can specify the number of replicas of a file. The replication factor can be specified at file creation time and can be changed later. Files in HDFS are write-once (except for appends and truncates) and have strictly one writer at any time. + +The NameNode makes all decisions regarding replication of blocks. It periodically receives a Heartbeat and a Blockreport from each of the DataNodes in the cluster. Receipt of a Heartbeat implies that the DataNode is functioning properly. A Blockreport contains a list of all blocks on a DataNode. + +HDFS DataNodes + +Replica Placement: The First Baby Steps +The placement of replicas is critical to HDFS reliability and performance. Optimizing replica placement distinguishes HDFS from most other distributed file systems. This is a feature that needs lots of tuning and experience. The purpose of a rack-aware replica placement policy is to improve data reliability, availability, and network bandwidth utilization. The current implementation for the replica placement policy is a first effort in this direction. The short-term goals of implementing this policy are to validate it on production systems, learn more about its behavior, and build a foundation to test and research more sophisticated policies. + +Large HDFS instances run on a cluster of computers that commonly spread across many racks. Communication between two nodes in different racks has to go through switches. In most cases, network bandwidth between machines in the same rack is greater than network bandwidth between machines in different racks. + +The NameNode determines the rack id each DataNode belongs to via the process outlined in Hadoop Rack Awareness. A simple but non-optimal policy is to place replicas on unique racks. 
This prevents losing data when an entire rack fails and allows use of bandwidth from multiple racks when reading data. This policy evenly distributes replicas in the cluster which makes it easy to balance load on component failure. However, this policy increases the cost of writes because a write needs to transfer blocks to multiple racks. + +For the common case, when the replication factor is three, HDFS’s placement policy is to put one replica on the local machine if the writer is on a datanode, otherwise on a random datanode in the same rack as that of the writer, another replica on a node in a different (remote) rack, and the last on a different node in the same remote rack. This policy cuts the inter-rack write traffic which generally improves write performance. The chance of rack failure is far less than that of node failure; this policy does not impact data reliability and availability guarantees. However, it does not reduce the aggregate network bandwidth used when reading data since a block is placed in only two unique racks rather than three. With this policy, the replicas of a block do not evenly distribute across the racks. Two replicas are on different nodes of one rack and the remaining replica is on a node of one of the other racks. This policy improves write performance without compromising data reliability or read performance. + +If the replication factor is greater than 3, the placement of the 4th and following replicas are determined randomly while keeping the number of replicas per rack below the upper limit (which is basically (replicas - 1) / racks + 2). + +Because the NameNode does not allow DataNodes to have multiple replicas of the same block, maximum number of replicas created is the total number of DataNodes at that time. + +After the support for Storage Types and Storage Policies was added to HDFS, the NameNode takes the policy into account for replica placement in addition to the rack awareness described above. The NameNode chooses nodes based on rack awareness at first, then checks that the candidate node have storage required by the policy associated with the file. If the candidate node does not have the storage type, the NameNode looks for another node. If enough nodes to place replicas can not be found in the first path, the NameNode looks for nodes having fallback storage types in the second path. + +The current, default replica placement policy described here is a work in progress. + +Replica Selection +To minimize global bandwidth consumption and read latency, HDFS tries to satisfy a read request from a replica that is closest to the reader. If there exists a replica on the same rack as the reader node, then that replica is preferred to satisfy the read request. If HDFS cluster spans multiple data centers, then a replica that is resident in the local data center is preferred over any remote replica. + +Block Placement Policies +As mentioned above when the replication factor is three, HDFS’s placement policy is to put one replica on the local machine if the writer is on a datanode, otherwise on a random datanode in the same rack as that of the writer, another replica on a node in a different (remote) rack, and the last on a different node in the same remote rack. If the replication factor is greater than 3, the placement of the 4th and following replicas are determined randomly while keeping the number of replicas per rack below the upper limit (which is basically (replicas - 1) / racks + 2). 
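+
+As a worked illustration of that per-rack upper limit (integer division, as in the formula above): with a replication factor of 10 and 3 racks the limit is (10 - 1) / 3 + 2 = 3 + 2 = 5 replicas on any one rack, while with the common replication factor of 3 and 2 racks it is (3 - 1) / 2 + 2 = 3, which is consistent with the layout described above where at most two of the three replicas share a rack.
+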
Additional to this HDFS supports 4 different pluggable Block Placement Policies. Users can choose the policy based on their infrastructre and use case. By default HDFS supports BlockPlacementPolicyDefault. + +Safemode +On startup, the NameNode enters a special state called Safemode. Replication of data blocks does not occur when the NameNode is in the Safemode state. The NameNode receives Heartbeat and Blockreport messages from the DataNodes. A Blockreport contains the list of data blocks that a DataNode is hosting. Each block has a specified minimum number of replicas. A block is considered safely replicated when the minimum number of replicas of that data block has checked in with the NameNode. After a configurable percentage of safely replicated data blocks checks in with the NameNode (plus an additional 30 seconds), the NameNode exits the Safemode state. It then determines the list of data blocks (if any) that still have fewer than the specified number of replicas. The NameNode then replicates these blocks to other DataNodes. + +The Persistence of File System Metadata +The HDFS namespace is stored by the NameNode. The NameNode uses a transaction log called the EditLog to persistently record every change that occurs to file system metadata. For example, creating a new file in HDFS causes the NameNode to insert a record into the EditLog indicating this. Similarly, changing the replication factor of a file causes a new record to be inserted into the EditLog. The NameNode uses a file in its local host OS file system to store the EditLog. The entire file system namespace, including the mapping of blocks to files and file system properties, is stored in a file called the FsImage. The FsImage is stored as a file in the NameNode’s local file system too. + +The NameNode keeps an image of the entire file system namespace and file Blockmap in memory. When the NameNode starts up, or a checkpoint is triggered by a configurable threshold, it reads the FsImage and EditLog from disk, applies all the transactions from the EditLog to the in-memory representation of the FsImage, and flushes out this new version into a new FsImage on disk. It can then truncate the old EditLog because its transactions have been applied to the persistent FsImage. This process is called a checkpoint. The purpose of a checkpoint is to make sure that HDFS has a consistent view of the file system metadata by taking a snapshot of the file system metadata and saving it to FsImage. Even though it is efficient to read a FsImage, it is not efficient to make incremental edits directly to a FsImage. Instead of modifying FsImage for each edit, we persist the edits in the Editlog. During the checkpoint the changes from Editlog are applied to the FsImage. A checkpoint can be triggered at a given time interval (dfs.namenode.checkpoint.period) expressed in seconds, or after a given number of filesystem transactions have accumulated (dfs.namenode.checkpoint.txns). If both of these properties are set, the first threshold to be reached triggers a checkpoint. + +The DataNode stores HDFS data in files in its local file system. The DataNode has no knowledge about HDFS files. It stores each block of HDFS data in a separate file in its local file system. The DataNode does not create all files in the same directory. Instead, it uses a heuristic to determine the optimal number of files per directory and creates subdirectories appropriately. 
It is not optimal to create all local files in the same directory because the local file system might not be able to efficiently support a huge number of files in a single directory. When a DataNode starts up, it scans through its local file system, generates a list of all HDFS data blocks that correspond to each of these local files, and sends this report to the NameNode. The report is called the Blockreport. + +The Communication Protocols +All HDFS communication protocols are layered on top of the TCP/IP protocol. A client establishes a connection to a configurable TCP port on the NameNode machine. It talks the ClientProtocol with the NameNode. The DataNodes talk to the NameNode using the DataNode Protocol. A Remote Procedure Call (RPC) abstraction wraps both the Client Protocol and the DataNode Protocol. By design, the NameNode never initiates any RPCs. Instead, it only responds to RPC requests issued by DataNodes or clients. + +Robustness +The primary objective of HDFS is to store data reliably even in the presence of failures. The three common types of failures are NameNode failures, DataNode failures and network partitions. + +Data Disk Failure, Heartbeats and Re-Replication +Each DataNode sends a Heartbeat message to the NameNode periodically. A network partition can cause a subset of DataNodes to lose connectivity with the NameNode. The NameNode detects this condition by the absence of a Heartbeat message. The NameNode marks DataNodes without recent Heartbeats as dead and does not forward any new IO requests to them. Any data that was registered to a dead DataNode is not available to HDFS any more. DataNode death may cause the replication factor of some blocks to fall below their specified value. The NameNode constantly tracks which blocks need to be replicated and initiates replication whenever necessary. The necessity for re-replication may arise due to many reasons: a DataNode may become unavailable, a replica may become corrupted, a hard disk on a DataNode may fail, or the replication factor of a file may be increased. + +The time-out to mark DataNodes dead is conservatively long (over 10 minutes by default) in order to avoid replication storm caused by state flapping of DataNodes. Users can set shorter interval to mark DataNodes as stale and avoid stale nodes on reading and/or writing by configuration for performance sensitive workloads. + +Cluster Rebalancing +The HDFS architecture is compatible with data rebalancing schemes. A scheme might automatically move data from one DataNode to another if the free space on a DataNode falls below a certain threshold. In the event of a sudden high demand for a particular file, a scheme might dynamically create additional replicas and rebalance other data in the cluster. These types of data rebalancing schemes are not yet implemented. + +Data Integrity +It is possible that a block of data fetched from a DataNode arrives corrupted. This corruption can occur because of faults in a storage device, network faults, or buggy software. The HDFS client software implements checksum checking on the contents of HDFS files. When a client creates an HDFS file, it computes a checksum of each block of the file and stores these checksums in a separate hidden file in the same HDFS namespace. When a client retrieves file contents it verifies that the data it received from each DataNode matches the checksum stored in the associated checksum file. If not, then the client can opt to retrieve that block from another DataNode that has a replica of that block. 
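+
+The verify-on-read idea behind this checksum scheme can be illustrated with a short, self-contained Java sketch. This is not the actual HDFS client code (HDFS keeps its checksums in the separate hidden file mentioned above and uses its own chunk size); it only shows the principle of checksumming fixed-size chunks and comparing them on read, here with java.util.zip.CRC32 over a local file:
+
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.zip.CRC32;
+
+public class ChecksumSketch {
+    // Computes one CRC32 value per fixed-size chunk of a local file.
+    static long[] chunkChecksums(String file, int chunkSize) throws Exception {
+        byte[] data = Files.readAllBytes(Paths.get(file));
+        int chunks = (data.length + chunkSize - 1) / chunkSize;
+        long[] sums = new long[chunks];
+        for (int i = 0; i < chunks; i++) {
+            CRC32 crc = new CRC32();
+            int offset = i * chunkSize;
+            crc.update(data, offset, Math.min(chunkSize, data.length - offset));
+            sums[i] = crc.getValue();
+        }
+        return sums;
+    }
+    // A reader would recompute the checksum of each chunk it receives and compare it
+    // with the stored value; on a mismatch it would fetch the block from another replica.
+}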
+ +Metadata Disk Failure +The FsImage and the EditLog are central data structures of HDFS. A corruption of these files can cause the HDFS instance to be non-functional. For this reason, the NameNode can be configured to support maintaining multiple copies of the FsImage and EditLog. Any update to either the FsImage or EditLog causes each of the FsImages and EditLogs to get updated synchronously. This synchronous updating of multiple copies of the FsImage and EditLog may degrade the rate of namespace transactions per second that a NameNode can support. However, this degradation is acceptable because even though HDFS applications are very data intensive in nature, they are not metadata intensive. When a NameNode restarts, it selects the latest consistent FsImage and EditLog to use. + +Another option to increase resilience against failures is to enable High Availability using multiple NameNodes either with a shared storage on NFS or using a distributed edit log (called Journal). The latter is the recommended approach. + +Snapshots +Snapshots support storing a copy of data at a particular instant of time. One usage of the snapshot feature may be to roll back a corrupted HDFS instance to a previously known good point in time. + +Data Organization +Data Blocks +HDFS is designed to support very large files. Applications that are compatible with HDFS are those that deal with large data sets. These applications write their data only once but they read it one or more times and require these reads to be satisfied at streaming speeds. HDFS supports write-once-read-many semantics on files. A typical block size used by HDFS is 128 MB. Thus, an HDFS file is chopped up into 128 MB chunks, and if possible, each chunk will reside on a different DataNode. + +Replication Pipelining +When a client is writing data to an HDFS file with a replication factor of three, the NameNode retrieves a list of DataNodes using a replication target choosing algorithm. This list contains the DataNodes that will host a replica of that block. The client then writes to the first DataNode. The first DataNode starts receiving the data in portions, writes each portion to its local repository and transfers that portion to the second DataNode in the list. The second DataNode, in turn starts receiving each portion of the data block, writes that portion to its repository and then flushes that portion to the third DataNode. Finally, the third DataNode writes the data to its local repository. Thus, a DataNode can be receiving data from the previous one in the pipeline and at the same time forwarding data to the next one in the pipeline. Thus, the data is pipelined from one DataNode to the next. + +Accessibility +HDFS can be accessed from applications in many different ways. Natively, HDFS provides a FileSystem Java API for applications to use. A C language wrapper for this Java API and REST API is also available. In addition, an HTTP browser and can also be used to browse the files of an HDFS instance. By using NFS gateway, HDFS can be mounted as part of the client’s local file system. + +FS Shell +HDFS allows user data to be organized in the form of files and directories. It provides a commandline interface called FS shell that lets a user interact with the data in HDFS. The syntax of this command set is similar to other shells (e.g. bash, csh) that users are already familiar with. 
Here are some sample action/command pairs: + +Action Command +Create a directory named /foodir bin/hadoop dfs -mkdir /foodir +Remove a directory named /foodir bin/hadoop fs -rm -R /foodir +View the contents of a file named /foodir/myfile.txt bin/hadoop dfs -cat /foodir/myfile.txt +FS shell is targeted for applications that need a scripting language to interact with the stored data. + +DFSAdmin +The DFSAdmin command set is used for administering an HDFS cluster. These are commands that are used only by an HDFS administrator. Here are some sample action/command pairs: + +Action Command +Put the cluster in Safemode bin/hdfs dfsadmin -safemode enter +Generate a list of DataNodes bin/hdfs dfsadmin -report +Recommission or decommission DataNode(s) bin/hdfs dfsadmin -refreshNodes +Browser Interface +A typical HDFS install configures a web server to expose the HDFS namespace through a configurable TCP port. This allows a user to navigate the HDFS namespace and view the contents of its files using a web browser. + +Space Reclamation +File Deletes and Undeletes +If trash configuration is enabled, files removed by FS Shell is not immediately removed from HDFS. Instead, HDFS moves it to a trash directory (each user has its own trash directory under /user//.Trash). The file can be restored quickly as long as it remains in trash. + +Most recent deleted files are moved to the current trash directory (/user//.Trash/Current), and in a configurable interval, HDFS creates checkpoints (under /user//.Trash/) for files in current trash directory and deletes old checkpoints when they are expired. See expunge command of FS shell about checkpointing of trash. + +After the expiry of its life in trash, the NameNode deletes the file from the HDFS namespace. The deletion of a file causes the blocks associated with the file to be freed. Note that there could be an appreciable time delay between the time a file is deleted by a user and the time of the corresponding increase in free space in HDFS. + +Following is an example which will show how the files are deleted from HDFS by FS Shell. We created 2 files (test1 & test2) under the directory delete + +$ hadoop fs -mkdir -p delete/test1 +$ hadoop fs -mkdir -p delete/test2 +$ hadoop fs -ls delete/ +Found 2 items +drwxr-xr-x - hadoop hadoop 0 2015-05-08 12:39 delete/test1 +drwxr-xr-x - hadoop hadoop 0 2015-05-08 12:40 delete/test2 +We are going to remove the file test1. The comment below shows that the file has been moved to Trash directory. + +$ hadoop fs -rm -r delete/test1 +Moved: hdfs://localhost:8020/user/hadoop/delete/test1 to trash at: hdfs://localhost:8020/user/hadoop/.Trash/Current +now we are going to remove the file with skipTrash option, which will not send the file to Trash.It will be completely removed from HDFS. + +$ hadoop fs -rm -r -skipTrash delete/test2 +Deleted delete/test2 +We can see now that the Trash directory contains only file test1. + +$ hadoop fs -ls .Trash/Current/user/hadoop/delete/ +Found 1 items\ +drwxr-xr-x - hadoop hadoop 0 2015-05-08 12:39 .Trash/Current/user/hadoop/delete/test1 +So file test1 goes to Trash and file test2 is deleted permanently. + +Decrease Replication Factor +When the replication factor of a file is reduced, the NameNode selects excess replicas that can be deleted. The next Heartbeat transfers this information to the DataNode. The DataNode then removes the corresponding blocks and the corresponding free space appears in the cluster. 
Once again, there might be a time delay between the completion of the setReplication API call and the appearance of free space in the cluster. + +References +Hadoop JavaDoc API. + +HDFS source code: http://hadoop.apache.org/version_control.html \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..9018f53759f73b1f95a44b900321c7e50d248ccf Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS Architecture.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS.txt new file mode 100644 index 0000000000000000000000000000000000000000..3853475f4a577008101cc19227770dc4ab4c9757 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS.txt @@ -0,0 +1,36 @@ +HDFS +When a dataset outgrows the storage capacity of a single physical machine, it becomes necessary to partition it across a number of separate machines. Filesystems that manage the storage across a network of machines are called distributed filesystems. +Hadoop comes with a distributed filesystem called HDFS, which stands for Hadoop Distributed Filesystem. +The Design of HDFS : +HDFS is a filesystem designed for storing very large files with streaming data access patterns, running on clusters of commodity hardware. +Very large files: +“Very large” in this context means files that are hundreds of megabytes, gigabytes, or terabytes in size. There are Hadoop clusters running today that store petabytes of data. +Streaming data access : +HDFS is built around the idea that the most efficient data processing pattern is a write-once, read-many-times pattern. A dataset is typically generated or copied from source, then various analyses are performed on that dataset over time. +Commodity hardware : +Hadoop doesn’t require expensive, highly reliable hardware to run on. It’s designed to run on clusters of commodity hardware (commonly available hardware available from multiple vendors3) for which the chance of node failure across the cluster is high, at least for large clusters. HDFS is designed to carry on working without a noticeable interruption to the user in the face of such failure. +These are areas where HDFS is not a good fit today: +Low-latency data access : +Applications that require low-latency access to data, in the tens of milliseconds range, will not work well with HDFS. +Lots of small files : +Since the namenode holds filesystem metadata in memory, the limit to the number of files in a filesystem is governed by the amount of memory on the namenode. +Multiple writers, arbitrary file modifications: +Files in HDFS may be written to by a single writer. Writes are always made at the end of the file. There is no support for multiple writers, or for modifications at arbitrary offsets in the file. +HDFS Concepts +Blocks: +HDFS has the concept of a block, but it is a much larger unit—64 MB by default. Files in HDFS are broken into block-sized chunks, which are stored as independent units. +Having a block abstraction for a distributed filesystem brings several benefits.: +The first benefit : +A file can be larger than any single disk in the network. There’s nothing that requires the blocks from a file to be stored on the same disk, so they can take advantage of any of the disks in the cluster. 
+Second:
+Making the unit of abstraction a block rather than a file simplifies the storage subsystem. The storage subsystem deals with blocks, simplifying storage management (since blocks are a fixed size, it is easy to calculate how many can be stored on a given disk) and eliminating metadata concerns.
+Third:
+Blocks fit well with replication for providing fault tolerance and availability. To insure against corrupted blocks and disk and machine failure, each block is replicated to a small number of physically separate machines (typically three).
+Why Is a Block in HDFS So Large?
+HDFS blocks are large compared to disk blocks, and the reason is to minimize the cost of seeks. By making a block large enough, the time to transfer the data from the disk can be made to be significantly larger than the time to seek to the start of the block. Thus the time to transfer a large file made of multiple blocks operates at the disk transfer rate.
+A quick calculation shows that if the seek time is around 10 ms, and the transfer rate is 100 MB/s, then to make the seek time 1% of the transfer time, we need to make the block size around 100 MB. The default is actually 64 MB, although many HDFS installations use 128 MB blocks. This figure will continue to be revised upward as transfer speeds grow with new generations of disk drives.
+Namenodes and Datanodes:
+An HDFS cluster has two types of node operating in a master-worker pattern: a namenode (the master) and a number of datanodes (workers). The namenode manages the filesystem namespace. It maintains the filesystem tree and the metadata for all the files and directories in the tree. This information is stored persistently on the local disk in the form of two files: the namespace image and the edit log. The namenode also knows the datanodes on which all the blocks for a given file are located.
+Apache Hadoop is designed to have a master-slave architecture. Master: NameNode, JobTracker. Slaves: {DataNode, TaskTracker}, ..., {DataNode, TaskTracker}. HDFS is one of the primary components of a Hadoop cluster and is itself designed with a master-slave architecture. Master: NameNode. Slaves: {DataNode}, ..., {DataNode}.
+- The master (NameNode) manages file system namespace operations such as opening, closing, and renaming files and directories, determines the mapping of blocks to DataNodes, and regulates access to files by clients.
+- The slaves (DataNodes) are responsible for serving read and write requests from the file system's clients, and perform block creation, deletion, and replication upon instruction from the master (NameNode).
+Datanodes are the workhorses of the filesystem. They store and retrieve blocks when they are told to (by clients or the namenode), and they report back to the namenode periodically with lists of blocks that they are storing.
+NameNode failure: if the machine running the namenode failed, all the files on the filesystem would be lost, since there would be no way of knowing how to reconstruct the files from the blocks on the datanodes. HDFS therefore takes the following precautions to recover the file system in case of a namenode failure.
+The first way is to back up the files that make up the persistent state of the filesystem metadata. Hadoop can be configured so that the namenode writes its persistent state to multiple filesystems. These writes are synchronous and atomic. The usual configuration choice is to write to local disk as well as a remote NFS mount.
+Second way: It is also possible to run a secondary namenode, which despite its name does not act as a namenode. Its main role is to periodically merge the namespace image with the edit log to prevent the edit log from becoming too large. However, it can be brought up to act as the primary namenode if needed.
+HDFS Federation: The namenode keeps a reference to every file and block in the filesystem in memory, which means that on very large clusters with many files, memory becomes the limiting factor for scaling. HDFS Federation, introduced in the 0.23 release series, allows a cluster to scale by adding namenodes, each of which manages a portion of the filesystem namespace. For example, one namenode might manage all the files rooted under /user, say, and a second namenode might handle files under /share. Namespace volumes are independent of each other, which means namenodes do not communicate with one another, and the failure of one namenode does not affect the availability of the namespaces managed by other namenodes. Block pool storage is not partitioned, however, so datanodes register with each namenode in the cluster and store blocks from multiple block pools.
+HDFS High Availability: The namenode is still a single point of failure (SPOF): if it fails, all clients, including MapReduce jobs, are unable to read, write, or list files, because the namenode is the sole repository of the metadata and the file-to-block mapping. In such an event the whole Hadoop system would effectively be out of service until a new namenode could be brought online. To recover from a failed namenode in this situation, an administrator starts a new primary namenode with one of the filesystem metadata replicas, and configures datanodes and clients to use this new namenode. The new namenode is not able to serve requests until it has (i) loaded its namespace image into memory, (ii) replayed its edit log, and (iii) received enough block reports from the datanodes to leave safe mode. On large clusters with many files and blocks, the time it takes for a namenode to start from cold can be 30 minutes or more. The 0.23 release series of Hadoop remedies this situation by adding support for HDFS high availability (HA). In this implementation there is a pair of namenodes in an active-standby configuration. In the event of the failure of the active namenode, the standby takes over its duties to continue servicing client requests without a significant interruption. A few architectural changes are needed to allow this to happen:
+- The namenodes must use highly available shared storage to share the edit log.
+- Datanodes must send block reports to both namenodes, since the block mappings are stored in a namenode's memory, not on disk.
+- Clients must be configured to handle namenode failover, which uses a mechanism that is transparent to users.
+Failover and fencing: The transition from the active namenode to the standby is managed by a new entity in the system called the failover controller. Failover controllers are pluggable, but the first implementation uses ZooKeeper to ensure that only one namenode is active. Failover may also be initiated manually by an administrator, for example in the case of routine maintenance. This is known as a graceful failover, since the failover controller arranges an orderly transition for both namenodes to switch roles.
In the case of an ungraceful failover, The HA implementation goes to great lengths to ensure that the previously active namenode is prevented from doing any damage and causing corruption—a method known as fencing. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..a11c1e626af77bcce485b28b67ae14d5153560ec Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/HDFS.txt.xml.xls differ diff --git "a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt" "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt" new file mode 100644 index 0000000000000000000000000000000000000000..34635b3559f623c0dba0fdb73f24adecf4d9fa6d --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt" @@ -0,0 +1,161 @@ +Hadoop Distributed File System (HDFS) Architecture – A Guide to HDFS for Every Data Engineer +download +Share +Aniruddha Bhandari — October 28, 2020 +Beginner Big data Data Engineering Hadoop +Overview +Get familiar with Hadoop Distributed File System (HDFS) +Understand the Components of HDFS + + +Introduction +In contemporary times, it is commonplace to deal with massive amounts of data. From your next WhatsApp message to your next Tweet, you are creating data at every step when you interact with technology. Now multiply that by 4.5 billion people on the internet – the math is simply mind-boggling! + +But ever wondered how to handle such data? Is it stored on a single machine? What if the machine fails? Will you lose your lovely 3 AM tweets *cough*? + + + +The answer is No. I am pretty sure you are already thinking about Hadoop. Hadoop is an amazing framework. With Hadoop by your side, you can leverage the amazing powers of Hadoop Distributed File System (HDFS)-the storage component of Hadoop. It is probably the most important component of Hadoop and demands a detailed explanation. + +So, in this article, we will learn what Hadoop Distributed File System (HDFS) really is and about its various components. Also, we will see what makes HDFS tick – that is what makes it so special. Let’s find out! + + + +Table of Contents +What is Hadoop Distributed File System (HDFS)? +What are the components of HDFS? +Blocks in HDFS? +Namenode in HDFS +Datanodes in HDFS +Secondary Node in HDFS +Replication Management +Replication of Blocks +What is a Rack in Hadoop? +Rack Awareness + + +What is Hadoop Distributed File System(HDFS)? +It is difficult to maintain huge volumes of data in a single machine. Therefore, it becomes necessary to break down the data into smaller chunks and store it on multiple machines. + +Filesystems that manage the storage across a network of machines are called distributed file systems. + +Hadoop Distributed File System (HDFS) is the storage component of Hadoop. All data stored on Hadoop is stored in a distributed manner across a cluster of machines. But it has a few properties that define its existence. + +Huge volumes – Being a distributed file system, it is highly capable of storing petabytes of data without any glitches. 
+Data access – It is based on the philosophy that “the most effective data processing pattern is write-once, the read-many-times pattern”. +Cost-effective – HDFS runs on a cluster of commodity hardware. These are inexpensive machines that can be bought from any vendor. + + +What are the components of the Hadoop Distributed File System(HDFS)? +HDFS has two main components, broadly speaking, – data blocks and nodes storing those data blocks. But there is more to it than meets the eye. So, let’s look at this one by one to get a better understanding. + +HDFS Blocks +HDFS breaks down a file into smaller units. Each of these units is stored on different machines in the cluster. This, however, is transparent to the user working on HDFS. To them, it seems like storing all the data onto a single machine. + +These smaller units are the blocks in HDFS. The size of each of these blocks is 128MB by default, you can easily change it according to requirement. So, if you had a file of size 512MB, it would be divided into 4 blocks storing 128MB each. + +hadoop hdfs blocks + +If, however, you had a file of size 524MB, then, it would be divided into 5 blocks. 4 of these would store 128MB each, amounting to 512MB. And the 5th would store the remaining 12MB. That’s right! This last block won’t take up the complete 128MB on the disk. + +hadoop hdfs blocks split + +But, you must be wondering, why such a huge amount in a single block? Why not multiple blocks of 10KB each? Well, the amount of data with which we generally deal with in Hadoop is usually in the order of petra bytes or higher. + +Therefore, if we create blocks of small size, we would end up with a colossal number of blocks. This would mean we would have to deal with equally large metadata regarding the location of the blocks which would just create a lot of overhead. And we don’t really want that! + +There are several perks to storing data in blocks rather than saving the complete file. + +The file itself would be too large to store on any single disk alone. Therefore, it is prudent to spread it across different machines on the cluster. +It would also enable a proper spread of the workload and prevent the choke of a single machine by taking advantage of parallelism. +Now, you must be wondering, what about the machines in the cluster? How do they store the blocks and where is the metadata stored? Let’s find out. + + + +Namenode in HDFS +HDFS operates in a master-worker architecture, this means that there are one master node and several worker nodes in the cluster. The master node is the Namenode. + +Namenode is the master node that runs on a separate node in the cluster. + +Manages the filesystem namespace which is the filesystem tree or hierarchy of the files and directories. +Stores information like owners of files, file permissions, etc for all the files. +It is also aware of the locations of all the blocks of a file and their size. +All this information is maintained persistently over the local disk in the form of two files: Fsimage and Edit Log. + +Fsimage stores the information about the files and directories in the filesystem. For files, it stores the replication level, modification and access times, access permissions, blocks the file is made up of, and their sizes. For directories, it stores the modification time and permissions. +Edit log on the other hand keeps track of all the write operations that the client performs. This is regularly updated to the in-memory metadata to serve the read requests. 
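+
+The block-location metadata that the Namenode maintains can also be inspected from application code. A minimal sketch using Hadoop's public FileSystem API (the path /user/demo/big.dat is hypothetical, and a configured cluster is assumed) prints, for each block of a file, the DataNodes that hold a replica of it:
+
+import java.util.Arrays;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.BlockLocation;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public class BlockLocationsSketch {
+    public static void main(String[] args) throws Exception {
+        FileSystem fs = FileSystem.get(new Configuration());
+        FileStatus status = fs.getFileStatus(new Path("/user/demo/big.dat"));
+        // One BlockLocation per block; each lists the DataNodes holding a replica.
+        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
+        for (BlockLocation block : blocks) {
+            System.out.println("offset " + block.getOffset() + ", length " + block.getLength()
+                    + ", hosts " + Arrays.toString(block.getHosts()));
+        }
+        fs.close();
+    }
+}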
+Whenever a client wants to write information to HDFS or read information from HDFS, it connects with the Namenode. The Namenode returns the location of the blocks to the client and the operation is carried out. + +Yes, that’s right, the Namenode does not store the blocks. For that, we have separate nodes. + + + +Datanodes in HDFS +Datanodes are the worker nodes. They are inexpensive commodity hardware that can be easily added to the cluster. + +Datanodes are responsible for storing, retrieving, replicating, deletion, etc. of blocks when asked by the Namenode. + +They periodically send heartbeats to the Namenode so that it is aware of their health. With that, a DataNode also sends a list of blocks that are stored on it so that the Namenode can maintain the mapping of blocks to Datanodes in its memory. + +But in addition to these two types of nodes in the cluster, there is also another node called the Secondary Namenode. Let’s look at what that is. + + + +Secondary Namenode in HDFS +Suppose we need to restart the Namenode, which can happen in case of a failure. This would mean that we have to copy the Fsimage from disk to memory. Also, we would also have to copy the latest copy of Edit Log to Fsimage to keep track of all the transactions. But if we restart the node after a long time, then the Edit log could have grown in size. This would mean that it would take a lot of time to apply the transactions from the Edit log. And during this time, the filesystem would be offline. Therefore, to solve this problem, we bring in the Secondary Namenode. + +Secondary Namenode is another node present in the cluster whose main task is to regularly merge the Edit log with the Fsimage and produce check‐points of the primary’s in-memory file system metadata. This is also referred to as Checkpointing. + +hadop hdfs checkpointing + + + +But the checkpointing procedure is computationally very expensive and requires a lot of memory, which is why the Secondary namenode runs on a separate node on the cluster. + +However, despite its name, the Secondary Namenode does not act as a Namenode. It is merely there for Checkpointing and keeping a copy of the latest Fsimage. + + + +Replication Management in HDFS +Now, one of the best features of HDFS is the replication of blocks which makes it very reliable. But how does it replicate the blocks and where does it store them? Let’s answer those questions now. + + + +Replication of blocks +HDFS is a reliable storage component of Hadoop. This is because every block stored in the filesystem is replicated on different Data Nodes in the cluster. This makes HDFS fault-tolerant. + +The default replication factor in HDFS is 3. This means that every block will have two more copies of it, each stored on separate DataNodes in the cluster. However, this number is configurable. + +hadoop hdfs replication + + + +But you must be wondering doesn’t that mean that we are taking up too much storage. For instance, if we have 5 blocks of 128MB each, that amounts to 5*128*3 = 1920 MB. True. But then these nodes are commodity hardware. We can easily scale the cluster to add more of these machines. The cost of buying machines is much lower than the cost of losing the data! + +Now, you must be wondering, how does Namenode decide which Datanode to store the replicas on? Well, before answering that question, we need to have a look at what is a Rack in Hadoop. + + + +What is a Rack in Hadoop? +A Rack is a collection of machines (30-40 in Hadoop) that are stored in the same physical location. 
There are multiple racks in a Hadoop cluster, all connected through switches. + +rack + +Rack awareness +Replica storage is a tradeoff between reliability and read/write bandwidth. To increase reliability, we need to store block replicas on different racks and Datanodes to increase fault tolerance. While the write bandwidth is lowest when replicas are stored on the same node. Therefore, Hadoop has a default strategy to deal with this conundrum, also known as the Rack Awareness algorithm. + +For example, if the replication factor for a block is 3, then the first replica is stored on the same Datanode on which the client writes. The second replica is stored on a different Datanode but on a different rack, chosen randomly. While the third replica is stored on the same rack as the second but on a different Datanode, again chosen randomly. If, however, the replication factor was higher, then the subsequent replicas would be stored on random Data Nodes in the cluster. + + rack awareness + + + +Endnotes +I hope by now you have got a solid understanding of what Hadoop Distributed File System(HDFS) is, what are its important components, and how it stores the data. There are however still a few more concepts that we need to cover with respect to Hadoop Distributed File System(HDFS), but that is a story for another article. + +For now, I recommend you go through the following articles to get a better understanding of Hadoop and this Big Data world! + +Hadoop Ecosystem +Introduction to MapReduce +Types of Tables in Apache Hive +Last but not the least, I recommend reading Hadoop: The Definitive Guide by Tom White. This article was highly inspired by it. \ No newline at end of file diff --git "a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt.xml.xls" "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt.xml.xls" new file mode 100644 index 0000000000000000000000000000000000000000..b5dfeed6371384b0b5746dc922d01952eb726c61 Binary files /dev/null and "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop Distributed File System (HDFS) Architecture \342\200\223 A Guide to HDFS for Every Data Engineer.txt.xml.xls" differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt new file mode 100644 index 0000000000000000000000000000000000000000..880e24ffa9038169edc8a908e17b9b19f9b29884 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt @@ -0,0 +1,198 @@ +Hadoop HDFS Architecture Explanation and Assumptions +Boost your career with Big Data Get Exclusive Offers on Big Data Course!! +This HDFS tutorial by DataFlair is designed to be an all in one package to answer all your questions about HDFS architecture. + +Hadoop Distributed File System(HDFS) is the world’s most reliable storage system. It is best known for its fault tolerance and high availability. + +In this article about HDFS Architecture Guide, you can read all about Hadoop HDFS. + +First of all, we will discuss what is HDFS next with the Assumptions and Goals of HDFS design. 
This HDFS architecture tutorial will also cover the detailed architecture of Hadoop HDFS, including the NameNode, DataNodes, the secondary NameNode, the checkpoint node, and the backup node in HDFS.
+HDFS features like rack awareness, high availability, data blocks, replication management, and HDFS data read and write operations are also discussed in this HDFS tutorial.
+What is Hadoop HDFS?
+HDFS stores very large files running on a cluster of commodity hardware. It works on the principle of storing a small number of large files rather than a huge number of small files. HDFS stores data reliably even in the case of hardware failure. It provides high throughput by providing data access in parallel.
+HDFS Assumptions and Goals
+I. Hardware failure
+Hardware failure is no longer the exception; it has become the norm. An HDFS instance consists of hundreds or thousands of server machines, each of which stores part of the file system's data. There is a huge number of components that are very susceptible to hardware failure, which means that some components are always non-functional. So the core architectural goal of HDFS is quick and automatic fault detection and recovery.
+II. Streaming data access
+HDFS applications need streaming access to their datasets. Hadoop HDFS is mainly designed for batch processing rather than interactive use by users. The emphasis is on high throughput of data access rather than low latency of data access. It focuses on how to retrieve data at the fastest possible speed, for example while analyzing logs.
+III. Large datasets
+HDFS works with large data sets. In standard practice, a file in HDFS ranges in size from gigabytes to petabytes. The architecture of HDFS is designed so that it is best suited for storing and retrieving huge amounts of data. HDFS should provide high aggregate data bandwidth and should be able to scale up to hundreds of nodes in a single cluster. It should also be able to handle tens of millions of files in a single instance.
+IV. Simple coherency model
+HDFS follows a write-once-read-many access model for files. Once a file is created, written, and closed, it should not be changed. This resolves data coherency issues and enables high-throughput data access. A MapReduce-based application or a web crawler application fits this model perfectly. As per the Apache notes, there is a plan to support appending writes to files in the future.
+V. Moving computation is cheaper than moving data
+If an application performs its computation near the data it operates on, it is much more efficient than when the computation is done far away from the data. This effect becomes stronger when dealing with large data sets. The main advantage of this is that it increases the overall throughput of the system. It also minimizes network congestion. The assumption is that it is better to move the computation closer to the data instead of moving the data to the computation.
+VI. Portability across heterogeneous hardware and software platforms
+HDFS is designed to be portable from one platform to another. This enables the widespread adoption of HDFS. It is the best platform for dealing with large sets of data.
+Introduction to HDFS Architecture
+HDFS Architecture
+
+Hadoop Distributed File System follows the master-slave architecture. Each cluster comprises a single master node and multiple slave nodes.
Internally the files get divided into one or more blocks, and each block is stored on different slave machines depending on the replication factor (which you will see later in this article). + +The master node stores and manages the file system namespace, that is information about blocks of files like block locations, permissions, etc. The slave nodes store data blocks of files. + +The Master node is the NameNode and DataNodes are the slave nodes. + +Let’s discuss each of the nodes in the Hadoop HDFS Architecture in detail. + +What is HDFS NameNode? +NameNode is the centerpiece of the Hadoop Distributed File System. It maintains and manages the file system namespace and provides the right access permission to the clients. + +The NameNode stores information about blocks locations, permissions, etc. on the local disk in the form of two files: + +Fsimage: Fsimage stands for File System image. It contains the complete namespace of the Hadoop file system since the NameNode creation. +Edit log: It contains all the recent changes performed to the file system namespace to the most recent Fsimage. +Functions of HDFS NameNode +It executes the file system namespace operations like opening, renaming, and closing files and directories. +NameNode manages and maintains the DataNodes. +It determines the mapping of blocks of a file to DataNodes. +NameNode records each change made to the file system namespace. +It keeps the locations of each block of a file. +NameNode takes care of the replication factor of all the blocks. +NameNode receives heartbeat and block reports from all DataNodes that ensure DataNode is alive. +If the DataNode fails, the NameNode chooses new DataNodes for new replicas. +Before Hadoop2, NameNode was the single point of failure. The High Availability Hadoop cluster architecture introduced in Hadoop 2, allows for two or more NameNodes running in the cluster in a hot standby configuration. + +What is HDFS DataNode? +DataNodes are the slave nodes in Hadoop HDFS. DataNodes are inexpensive commodity hardware. They store blocks of a file. + +Functions of DataNode +DataNode is responsible for serving the client read/write requests. +Based on the instruction from the NameNode, DataNodes performs block creation, replication, and deletion. +DataNodes send a heartbeat to NameNode to report the health of HDFS. +DataNodes also sends block reports to NameNode to report the list of blocks it contains. +What is Secondary NameNode? +HDFS architecture secondary namenode + +Apart from DataNode and NameNode, there is another daemon called the secondary NameNode. Secondary NameNode works as a helper node to primary NameNode but doesn’t replace primary NameNode. + +When the NameNode starts, the NameNode merges the Fsimage and edit logs file to restore the current file system namespace. Since the NameNode runs continuously for a long time without any restart, the size of edit logs becomes too large. This will result in a long restart time for NameNode. + +Secondary NameNode solves this issue. + +Secondary NameNode downloads the Fsimage file and edit logs file from NameNode. + +It periodically applies edit logs to Fsimage and refreshes the edit logs. The updated Fsimage is then sent to the NameNode so that NameNode doesn’t have to re-apply the edit log records during its restart. This keeps the edit log size small and reduces the NameNode restart time. + +If the NameNode fails, the last save Fsimage on the secondary NameNode can be used to recover file system metadata. 
The secondary NameNode performs regular checkpoints in HDFS. + +What is Checkpoint Node? +The Checkpoint node is a node that periodically creates checkpoints of the namespace. + +Checkpoint Node in Hadoop first downloads Fsimage and edits from the Active Namenode. Then it merges them (Fsimage and edits) locally, and at last, it uploads the new image back to the active NameNode. + +It stores the latest checkpoint in a directory that has the same structure as the Namenode’s directory. This permits the checkpointed image to be always available for reading by the NameNode if necessary. + +What is Backup Node? +A Backup node provides the same checkpointing functionality as the Checkpoint node. + +In Hadoop, Backup node keeps an in-memory, up-to-date copy of the file system namespace. It is always synchronized with the active NameNode state. + +It is not required for the backup node in HDFS architecture to download Fsimage and edits files from the active NameNode to create a checkpoint. It already has an up-to-date state of the namespace state in memory. + +The Backup node checkpoint process is more efficient as it only needs to save the namespace into the local Fsimage file and reset edits. NameNode supports one Backup node at a time. + +This was about the different types of nodes in HDFS Architecture. Further in this HDFS Architecture tutorial, we will learn about the Blocks in HDFS, Replication Management, Rack awareness and read/write operations. + +Let us now study the block in HDFS. + +What are Blocks in HDFS Architecture? +data blocks in hadoop HDFS + +Internally, HDFS split the file into block-sized chunks called a block. The size of the block is 128 Mb by default. One can configure the block size as per the requirement. + +For example, if there is a file of size 612 Mb, then HDFS will create four blocks of size 128 Mb and one block of size 100 Mb. + +The file of a smaller size does not occupy the full block size space in the disk. + +For example, the file of size 2 Mb will occupy only 2 Mb space in the disk. + +The user doesn’t have any control over the location of the blocks. + +Read the HDFS Block article to explore in detail. + +HDFS is highly fault-tolerant. Now, look at what makes HDFS fault-tolerant. + +What is Replication Management? +For a distributed system, the data must be redundant to multiple places so that if one machine fails, the data is accessible from other machines. + +In Hadoop, HDFS stores replicas of a block on multiple DataNodes based on the replication factor. + +The replication factor is the number of copies to be created for blocks of a file in HDFS architecture. + +If the replication factor is 3, then three copies of a block get stored on different DataNodes. So if one DataNode containing the data block fails, then the block is accessible from the other DataNode containing a replica of the block. + +If we are storing a file of 128 Mb and the replication factor is 3, then (3*128=384) 384 Mb of disk space is occupied for a file as three copies of a block get stored. + +This replication mechanism makes HDFS fault-tolerant. + +Read the Fault tolerance article to learn in detail. + +What is Rack Awareness in HDFS Architecture? +Let us now talk about how HDFS store replicas on the DataNodes? What is a rack? What is rack awareness? + +Rack is the collection of around 40-50 machines (DataNodes) connected using the same network switch. If the network goes down, the whole rack will be unavailable. 
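+Before continuing with rack awareness, here is a tiny illustrative Java sketch of the block-splitting arithmetic from the Blocks section above. It is not HDFS code, just the same calculation, reproducing the 612 Mb example (four 128 Mb blocks plus one 100 Mb block) and the point that a small file occupies only its actual size.
+
+// Illustrative only: reproduces the block-size arithmetic described above.
+public class BlockSplitExample {
+    static final long BLOCK_SIZE_MB = 128; // HDFS default block size
+
+    public static void main(String[] args) {
+        long fileSizeMb = 612;
+        long fullBlocks = fileSizeMb / BLOCK_SIZE_MB;   // 4 blocks of 128 Mb
+        long lastBlockMb = fileSizeMb % BLOCK_SIZE_MB;  // 1 block of 100 Mb
+        System.out.println(fullBlocks + " blocks of " + BLOCK_SIZE_MB + " Mb"
+                + (lastBlockMb > 0 ? " and one block of " + lastBlockMb + " Mb" : ""));
+        // A 2 Mb file would occupy a single 2 Mb block on disk,
+        // not a full 128 Mb of space.
+    }
+}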
+ +Rack Awareness is the concept of choosing the closest node based on the rack information. + +To ensure that all the replicas of a block are not stored on the same rack or a single rack, NameNode follows a rack awareness algorithm to store replicas and provide latency and fault tolerance. + +Suppose if the replication factor is 3, then according to the rack awareness algorithm: + +The first replica will get stored on the local rack. +The second replica will get stored on the other DataNode in the same rack. +The third replica will get stored on a different rack. +HDFS Read and Write Operation +1. Write Operation + +When a client wants to write a file to HDFS, it communicates to the NameNode for metadata. The Namenode responds with a number of blocks, their location, replicas, and other details. Based on information from NameNode, the client directly interacts with the DataNode. + +The client first sends block A to DataNode 1 along with the IP of the other two DataNodes where replicas will be stored. When Datanode 1 receives block A from the client, DataNode 1 copies the same block to DataNode 2 of the same rack. As both the DataNodes are in the same rack, so block transfer via rack switch. Now DataNode 2 copies the same block to DataNode 4 on a different rack. As both the DataNoNes are in different racks, so block transfer via an out-of-rack switch. + +When DataNode receives the blocks from the client, it sends write confirmation to Namenode. + +The same process is repeated for each block of the file. + +2. Read Operation + +To read from HDFS, the client first communicates with the NameNode for metadata. The Namenode responds with the locations of DataNodes containing blocks. After receiving the DataNodes locations, the client then directly interacts with the DataNodes. + +The client starts reading data parallelly from the DataNodes based on the information received from the NameNode. The data will flow directly from the DataNode to the client. + +When a client or application receives all the blocks of the file, it combines these blocks into the form of an original file. + +Go through the HDFS read and write operation article to study how the client can read and write files in Hadoop HDFS. + +Overview Of HDFS Architecture +In Hadoop HDFS, NameNode is the master node and DataNodes are the slave nodes. The file in HDFS is stored as data blocks. + +The file is divided into blocks (A, B, C in the below GIF). These blocks get stored on different DataNodes based on the Rack Awareness Algorithm. Block A on DataNode-1(DN-1), block B on DataNode-6(DN-6), and block C on DataNode-7(DN-7). + +To provide Fault Tolerance, replicas of blocks are created based on the replication factor. + +In the below GIF, 2 replicas of each block is created (using default replication factor 3). Replicas were placed on different DataNodes, thus ensuring data availability even in the case of DataNode failure or rack failure. + +hadoop hdfs architecture + +So, This was all on HDFS Architecture Tutorial. Follow the following links to master HDFS architecture. + +Summary +After reading the HDFS architecture tutorial, we can conclude that the HDFS divides the files into blocks. The size of the block is 128 Mb by default, which we can configure as per the requirements. + +The master node (NameNode) stores and manages the metadata about block locations, blocks of a file, etc.The DataNode stores the actual data blocks. The Master Node manages the DataNodes. 
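+As a companion to the write and read flows described above, here is a minimal client-side sketch using the standard org.apache.hadoop.fs API. The cluster URI and path are placeholders; behind these few calls the FileSystem client performs the NameNode metadata lookups and the direct DataNode transfers described earlier.
+
+import java.net.URI;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+
+public class HdfsReadWriteSketch {
+    public static void main(String[] args) throws Exception {
+        Configuration conf = new Configuration();
+        // Placeholder cluster URI; in practice this comes from core-site.xml (fs.defaultFS).
+        FileSystem fs = FileSystem.get(URI.create("hdfs://namenode:8020"), conf);
+        Path file = new Path("/user/demo/hello.txt");
+
+        // Write: the NameNode allocates blocks and chooses DataNodes;
+        // the client then streams the data to the DataNode pipeline.
+        try (FSDataOutputStream out = fs.create(file, true)) {
+            out.writeUTF("hello hdfs");
+        }
+
+        // Read: the NameNode returns block locations; data flows
+        // directly from the DataNodes back to the client.
+        try (FSDataInputStream in = fs.open(file)) {
+            IOUtils.copyBytes(in, System.out, 4096, false);
+        }
+        fs.close();
+    }
+}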
+ +HDFS creates replicas of blocks and stores them on different DataNodes in order to provide fault tolerance. Also, NameNode uses the Rack Awareness algorithm to improve cluster performance. + +Loving Hadoop? Join our course and Boost Your Career with BIG DATA + +If you face any difficulty in this HDFS Architecture tutorial, please comment and ask. + +Keep Learning!! \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..77a7b1d4ed65c3c0dadc8805203c56f50298e501 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop HDFS Architecture Explanation and Assumptions.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff2709878573fee0a1b9ad45dec9703024d6e3a9 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview-ziyan.txt @@ -0,0 +1,26 @@ +Namenode , Namenode +INode , INode +INode Reference , INode Reference +INodes In Path , INodes In Path +INode Directory , INode Directory +INode With Additional Fields , INode With Additional Fields +Feature , Feature +XAttr Feature , XAttr Feature +File Under Construction Feature , File Under Construction Feature +Directory With Snapshot Feature , Directory With Snapshot Feature +Directory Snapshottable Feature , Directory Snapshottable Feature +Acl Feature , Acl Feature +Directory With Quota Feature , Directory With Quota Feature +Edit Log File Output Stream , Edit Log File Output Stream +Edit Log Backup Output Stream , Edit Log Backup Output Stream +Edit Log Backup Output Stream , Edit log +Quorum Output Stream , Quorum Output Stream +Quorum Output Stream , Byte Range Input Stream +Journal Set Output Stream , Journal Set Output Stream +Edit Log File Input Stream , Edit Log File Input Stream +Edit Log Backup Input Stream , Edit Log Backup Input Stream +Edit Log Backup Input Stream , Edit log +Edit Log Byte Input Stream , Edit Log Byte Input Stream +Edit Log Byte Input Stream , Edit log +Redundant Edit Log Input Stream , Redundant Edit Log Input Stream +Datanode , Datanode \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt new file mode 100644 index 0000000000000000000000000000000000000000..c37613f4b7c9efa2e2ebbee297d61d3a3c1f6596 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt @@ -0,0 +1,149 @@ +This post is part 1 of a 4-part series on monitoring Hadoop health and performance. Part 2 dives into the key metrics to monitor, Part 3 details how to monitor Hadoop performance natively, and Part 4 explains how to monitor a Hadoop deployment with Datadog. + +In this post, we’ll explore each of the technologies that make up a typical Hadoop deployment, and see how they all fit together. If you’re already familiar with HDFS, MapReduce, and YARN, feel free to continue on to Part 2 to dive right into Hadoop’s key performance metrics. + +What is Hadoop? 
+Apache Hadoop is a framework for distributed computation and storage of very large data sets on computer clusters. Hadoop began as a project to implement Google’s MapReduce programming model, and has become synonymous with a rich ecosystem of related technologies, not limited to: Apache Pig, Apache Hive, Apache Spark, Apache HBase, and others. + +Hadoop has seen widespread adoption by many companies including Facebook, Yahoo!, Adobe, Cisco, eBay, Netflix, and Datadog. + +Hadoop architecture overview +Hadoop has three core components, plus ZooKeeper if you want to enable high availability: + +Hadoop Distributed File System (HDFS) +MapReduce +Yet Another Resource Negotiator (YARN) +ZooKeeper +Note that HDFS uses the term “master” to describe the primary node in a cluster. Where possible, we will use the more inclusive term “leader.” In cases where using an alternative term would introduce ambiguity, such as the YARN-specific class name ApplicationMaster, we preserve the original term. + +HDFS architecture +The Hadoop Distributed File System (HDFS) is the underlying file system of a Hadoop cluster. It provides scalable, fault-tolerant, rack-aware data storage designed to be deployed on commodity hardware. Several attributes set HDFS apart from other distributed file systems. Among them, some of the key differentiators are that HDFS is: + +designed with hardware failure in mind +built for large datasets, with a default block size of 128 MB +optimized for sequential operations +rack-aware +cross-platform and supports heterogeneous clusters +Data in a Hadoop cluster is broken down into smaller units (called blocks) and distributed throughout the cluster. Each block is duplicated twice (for a total of three copies), with the two replicas stored on two nodes in a rack somewhere else in the cluster. Since the data has a default replication factor of three, it is highly available and fault-tolerant. If a copy is lost (because of machine failure, for example), HDFS will automatically re-replicate it elsewhere in the cluster, ensuring that the threefold replication factor is maintained. + +HDFS architecture can vary, depending on the Hadoop version and features needed: + +Vanilla HDFS +High-availability HDFS +HDFS is based on a leader/follower architecture. Each cluster is typically composed of a single NameNode, an optional SecondaryNameNode (for data recovery in the event of failure), and an arbitrary number of DataNodes. + +Hadoop architecture - Vanilla Hadoop deployment diagramA vanilla Hadoop deployment +In addition to managing the file system namespace and associated metadata (file-to-block maps), the NameNode acts as the leader and brokers access to files by clients (though once brokered, clients communicate directly with DataNodes). The NameNode operates entirely in memory, persisting its state to disk. It represents a single point of failure for a Hadoop cluster that is not running in high-availability mode. To mitigate against this, production clusters typically persist state to two local disks (in case of a single disk failure) and also to an NFS-mounted volume (in case of total machine failure). In high-availability mode, Hadoop maintains a standby NameNode to guard against failures. Earlier versions of Hadoop offered an alternative with the introduction of the SecondaryNameNode concept, and many clusters today still operate with a SecondaryNameNode. + +To understand the function of the SecondaryNameNode requires an explanation of the mechanism by which the NameNode stores its state. 
+ +fsimage and the edit log +The NameNode stores file system metadata in two different files: the fsimage and the edit log. The fsimage stores a complete snapshot of the file system’s metadata at a specific moment in time. Incremental changes (like renaming or appending a few bytes to a file) are then stored in the edit log for durability, rather than creating a new fsimage snapshot each time the namespace is modified. With this separation of concerns in places, the NameNode can restore its state by loading the fsimage and performing all the transforms from the edit log, restoring the file system to its most recent state. + +Hadoop architecture - Secondary NameNode architecture diagram +Through RPC calls, the SecondaryNameNode is able to independently update its copy of the fsimage each time changes are made to the edit log. Thus, if the NameNode goes down in the presence of a SecondaryNameNode, the NameNode doesn’t need to replay the edit log on top of the fsimage; cluster administrators can retrieve an updated copy of the fsimage from the SecondaryNameNode. + +SecondaryNameNodes provide a means for much faster recovery in the event of NameNode failure. Despite its name, though, it is not a drop-in replacement for the NameNode and does not provide a means for automated failover. + +HA NameNode service +Early versions of Hadoop introduced several concepts (like SecondaryNameNodes, among others) to make the NameNode more resilient. With Hadoop 2.0 and Standby NameNodes, a mechanism for true high availability was realized. + +Standby NameNodes, which are incompatible with SecondaryNameNodes, provide automatic failover in the event of primary NameNode failure. Achieving high availability with Standby NameNodes requires shared storage between the primary and standbys (for the edit log). + +Though there are two options for the necessary shared storage—NFS and Quorum Journal Manager(QJM)—only QJM is considered production-ready. + +NameNode and QJM +Using the Quorum Journal Manager (QJM) is the preferred method for achieving high availability for HDFS. + +Using QJM to maintain consistency of Active and Standby state requires that both nodes be able to communicate with a group of JournalNodes (JNs). When the Active node modifies the namespace, it logs a record of the change to a majority of JournalNodes. The StandbyNode watches the JNs for changes to the edit log and applies them to its own namespace. + +Hadoop architecture - QJM interaction diagram +JournalNode daemons have relatively low overhead, so provisioning additional machines for them is unnecessary—the daemons can be run on the same machines as existing Hadoop nodes. Typically, a daemon is run on the ResourceManager as well as on each of the two NameNodes. Because edit log changes require a quorum of JNs, you must maintain an odd number of at least three daemons running at any one time. JournalNodes can tolerate failures of at most (N - 1) / 2 nodes (where N is the number of JNs). + +Alternative file systems +HDFS is the canonical file system for Hadoop, but Hadoop’s file system abstraction supports a number of alternative file systems, including the local file system, FTP, AWS S3, Azure’s file system, and OpenStack’s Swift. The file system used is determined by the access URI, e.g., file: for the local file system, s3: for data stored on Amazon S3, etc. Most of these have limitations, though, and in production HDFS is almost always the file system used for the cluster. 
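+To illustrate the URI-based file system selection mentioned above, here is a small sketch (hosts and URIs are examples only) showing how the same org.apache.hadoop.fs.FileSystem abstraction resolves to different implementations depending on the scheme. Accessing S3 or Azure would additionally require the corresponding connector jars and credentials.
+
+import java.net.URI;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+
+public class FileSystemSchemes {
+    public static void main(String[] args) throws Exception {
+        Configuration conf = new Configuration();
+
+        // Local file system: file://
+        FileSystem local = FileSystem.get(URI.create("file:///"), conf);
+        System.out.println("file://  -> " + local.getClass().getName());
+
+        // HDFS: hdfs:// (placeholder NameNode host)
+        FileSystem hdfs = FileSystem.get(URI.create("hdfs://namenode:8020/"), conf);
+        System.out.println("hdfs://  -> " + hdfs.getClass().getName());
+
+        // Amazon S3 (requires the S3 connector and credentials on the classpath):
+        // FileSystem s3 = FileSystem.get(URI.create("s3a://my-bucket/"), conf);
+    }
+}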
+ +MapReduce overview +MapReduce is a framework tailor-made for processing large datasets in a distributed fashion across multiple machines. The core of a MapReduce job can be, err, reduced to three operations: map an input data set into a collection of pairs, shuffle the resulting data (transfer data to the reducers), then reduce over all pairs with the same key. + +The top-level unit of work in MapReduce is a job. Each job is composed of one or more map or reduce tasks. + +The canonical example of a MapReduce job is counting word frequencies in a body of text. The image below illustrates such an example: + +Hadoop architecture - MapReduce word frequency flow diagram +Key differences between versions +In earlier versions of Hadoop (pre-2.0), MapReduce took care of its own resource allocation and job scheduling as well as the actual computation. + +Newer versions of Hadoop (2.0+) decouple the scheduling from the computation with YARN, which handles the allocation of computational resources for MapReduce jobs. This allows other processing frameworks (see below) to share the cluster without resource contention. + +Other frameworks +Though Hadoop comes with MapReduce out of the box, a number of computing frameworks have been developed for or adapted to the Hadoop ecosystem. Among the more popular are Apache Spark and Apache Tez. This article series will focus on MapReduce as the compute framework. + +Untangling YARN +YARN (Yet Another Resource Negotiator) is the framework responsible for assigning computational resources for application execution. + +Hadoop architecture - YARN architecture diagram +YARN consists of three core components: + +ResourceManager (one per cluster) +ApplicationMaster (one per application) +NodeManagers (one per node) +Caution, overloaded terms ahead +YARN uses some very common terms in uncommon ways. For example, when most people hear “container”, they think Docker. In the Hadoop ecosystem, it takes on a new meaning: a Resource Container (RC) represents a collection of physical resources. It is an abstraction used to bundle resources into distinct, allocatable units. + +“Application” is another overloaded term—in YARN, an application represents a set of tasks that are to be executed together. Application in YARN is synonymous with MapReduce’s job concept. + +ResourceManager +The ResourceManager is the rack-aware leader node in YARN. It is responsible for taking inventory of available resources and runs several critical services, the most important of which is the Scheduler. + +Scheduler +The Scheduler component of the YARN ResourceManager allocates resources to running applications. It is a pure scheduler in that it does not monitor or track application status or progress. As it performs no monitoring, it cannot guarantee that tasks will restart should they fail. + +As of Hadoop 2.7.2, YARN supports several scheduler policies: the CapacityScheduler, the FairScheduler, and the FIFO (first in first out) Scheduler. The default scheduler varies by Hadoop distribution, but no matter the policy used, the Scheduler allocates resources by assigning containers (bundles of physical resources) to the requesting ApplicationMaster. + +ApplicationMaster +Each application running on Hadoop has its own dedicated ApplicationMaster instance. This instance lives in its own, separate container on one of the nodes in the cluster. Each application’s ApplicationMaster periodically sends heartbeat messages to the ResourceManager, as well as requests for additional resources, if needed. 
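+To ground the word-frequency example from the MapReduce overview above, here is a compact sketch of the classic word-count mapper and reducer using the org.apache.hadoop.mapreduce API (class names are illustrative, and the job driver is omitted): the map step emits (word, 1) pairs, the shuffle groups them by word, and the reduce step sums the counts.
+
+import java.io.IOException;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+
+public class WordCountSketch {
+
+    // Map: split each input line into words and emit (word, 1).
+    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
+        private static final IntWritable ONE = new IntWritable(1);
+        private final Text word = new Text();
+
+        @Override
+        protected void map(LongWritable key, Text value, Context context)
+                throws IOException, InterruptedException {
+            for (String token : value.toString().split("\\s+")) {
+                if (!token.isEmpty()) {
+                    word.set(token);
+                    context.write(word, ONE);
+                }
+            }
+        }
+    }
+
+    // Reduce: after the shuffle, all counts for a given word arrive together; sum them.
+    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
+        @Override
+        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
+                throws IOException, InterruptedException {
+            int sum = 0;
+            for (IntWritable v : values) {
+                sum += v.get();
+            }
+            context.write(key, new IntWritable(sum));
+        }
+    }
+}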
Additional resources are granted by the ResourceManager through the assignment of Container Resource leases, which serve as reservations for containers on NodeManagers. + +The ApplicationMaster oversees the execution of an application over its full lifespan, from requesting additional containers from the ResourceManger, to submitting container release requests to the NodeManager. + +NodeManagers +The NodeManager is a per-node agent tasked with overseeing containers throughout their lifecycles, monitoring container resource usage, and periodically communicating with the ResourceManager. + +Conceptually, NodeManagers are much like TaskTrackers in earlier versions of Hadoop. Whereas TaskTrackers used a fixed number of map and reduce slots for scheduling, NodeManagers have a number of dynamically created, arbitrarily-sized Resource Containers (RCs). Unlike slots in MR1, RCs can be used for map tasks, reduce tasks, or tasks from other frameworks. + +Executing applications with YARN +Hadoop architecture - YARN application execution diagram +Typical application execution with YARN follows this flow: + +Client program submits the MapReduce application to the ResourceManager, along with information to launch the application-specific ApplicationMaster. +ResourceManager negotiates a container for the ApplicationMaster and launches the ApplicationMaster. +ApplicationMaster boots and registers with the ResourceManager, allowing the original calling client to interface directly with the ApplicationMaster. +ApplicationMaster negotiates resources (resource containers) for client application. +ApplicationMaster gives the container launch specification to the NodeManager, which launches a container for the application. +During execution, client polls ApplicationMaster for application status and progress. +Upon completion, ApplicationMaster deregisters with the ResourceManager and shuts down, returning its containers to the resource pool. +ZooKeeper +Apache ZooKeeper is a popular tool used for coordination and synchronization of distributed systems. Since Hadoop 2.0, ZooKeeper has become an essential service for Hadoop clusters, providing a mechanism for enabling high-availability of former single points of failure, specifically the HDFS NameNode and YARN ResourceManager. + +HDFS and ZooKeeper +Hadoop architecture - NameNode HA with ZooKeeper diagram +In previous versions of Hadoop, the NameNode represented a single point of failure—should the NameNode fail, the entire HDFS cluster would become unavailable as the metadata containing the file-to-block mappings would be lost. + +Hadoop 2.0 brought many improvements, among them a high-availability NameNode service. When ZooKeeper is used in conjunction with QJM or NFS, it enables automatic failover. + +Automatic NameNode failover requires two components: a ZooKeeper quorum, and a ZKFailoverController (ZKFC) process running on each NameNode. The NameNode and Standby NameNodes maintain persistent sessions in ZooKeeper, with the NameNode holding a special, ephemeral “lock” znode (the equivalent of a file or directory, in a regular file system); if the NameNode does not maintain contact with the ZooKeeper ensemble, its session is expired, triggering a failover (handled by ZKFC). + +ZKFailoverController is a process that runs alongside the NameNode and Standby NameNodes, periodically checking the health of the node it is running on. 
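+The ephemeral "lock" znode mechanism described above can be sketched with the plain ZooKeeper Java client. This is only the underlying idea, not the ZKFC code itself; the ensemble address and znode path are placeholders, and a real implementation waits for the SyncConnected event before creating the znode.
+
+import org.apache.zookeeper.CreateMode;
+import org.apache.zookeeper.KeeperException;
+import org.apache.zookeeper.ZooDefs;
+import org.apache.zookeeper.ZooKeeper;
+
+public class EphemeralLockSketch {
+    public static void main(String[] args) throws Exception {
+        // Placeholder ensemble address and lock path.
+        ZooKeeper zk = new ZooKeeper("zk1:2181,zk2:2181,zk3:2181", 15000, event -> { });
+        String lockPath = "/active-lock-demo";
+        try {
+            // EPHEMERAL: the znode lives only as long as this client session.
+            zk.create(lockPath, "node-1".getBytes(),
+                    ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
+            System.out.println("Acquired the lock; acting as the active node.");
+        } catch (KeeperException.NodeExistsException e) {
+            System.out.println("Another node holds the lock; staying in standby.");
+        }
+        // If this process dies or its session expires, ZooKeeper deletes the
+        // ephemeral znode and a standby can acquire the lock (failover).
+    }
+}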
On healthy nodes, ZKFC will try to acquire the lock znode, succeeding if no other node holds the lock (which means the primary NameNode has failed). Once the lock is acquired, the new NameNode transitions to the active NameNode. + +YARN and ZooKeeper +Hadoop architecture - ResourceManager HA with ZooKeeper diagram +When YARN was initially created, its ResourceManager represented a single point of failure—if NodeManagers lost contact with the ResourceManager, all jobs in progress would be halted, and no new jobs could be assigned. + +Hadoop 2.4 improved YARN’s resilience with the release of the ResourceManager high-availability feature. The new feature incorporates ZooKeeper to allow for automatic failover to a standby ResourceManager in the event of the primary’s failure. + +Like HDFS, YARN uses a similar, ZooKeeper-managed lock to ensure only one ResourceManager is active at once. Unlike HDFS, YARN’s automatic failover mechanism does not run as a separate process—instead, its ActiveStandbyElector service is part of the ResourceManager process itself. Like ZKFailoverController, the ActiveStandbyElector service on each ResourceManager continuously vies for control of an ephemeral znode, ActiveStandbyElectorLock. Because the node is ephemeral, if the currently active RM allows the session to expire, the RM that successfully acquires a lock on the ActiveStandbyElectorLock will automatically be promoted to the active state. + +From theory, to practice +In this post, we’ve explored all the core components found in a standard Hadoop cluster. + +Read on to the next article in this series for an examination of Hadoop’s key performance metrics and health indicators. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..d90b1bf7c43b49195e8ddfe05c5d530bb3833417 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop architectural overview.txt.xml.xls differ diff --git "a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt" "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt" new file mode 100644 index 0000000000000000000000000000000000000000..accbf130a9e4e2b71e0bcee12873c1ddfdf6d3af --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt" @@ -0,0 +1,1138 @@ +Hadoop clusters with Kove® XPD™ persistent memory + + +Mark Kerzner (mark@hadoopilluminated.com), Greg Keller (greg@r-hpc.com), Ivan Lazarov (ivan.lazarov@shmsoft.com), Sujee Maniyam (sujee@hadoopilluminated.com) + + +Abstract + + +Since the Hadoop cluster stores its most vital information on its NameNode server in RAM, this represents a potential point of failure and source of data loss. The usual precautions take the form of storing this information on multiple local hard drives and on a remote one. However, even if the data is preserved in the case of failure, it make take many hours to restore the cluster to its operation. + + +In contrast, by running the Hadoop NameNode on the Kove XPD, one can achieve very fast restoration of Hadoop functionality after such failures as a power loss or motherboard failure. 
This is accomplished by running a modified version of the Hadoop software, which maps the memory space of the NameNode onto the Kove XPD and can be found on GitHub here https://github.com/markkerzner/nn_kove. + + +Standard RAM presents yet another limitation on Hadoop: it makes it limited in size. One can only store as much data (files and blocks) as the RAM will allow. By contrast, Kove XPD is unlimited in size, and thus using it results in the removal of this limitation on the Hadoop cluster size. + + +Background + + +The Hadoop NameNode saves all of its block and file information in memory. This is done for the sake of efficiency, but it is naturally a single point of failure. There are multiple approaches to alleviate this SPOF, ranging from NameNode HA in Hadoop 2 to distributed NameNode. + + +However, a very enticing prospect is running NameNode on persistent memory, provided by the Kove XPD device. + + +The advantages of this implementation would be twofold. + + +1. Persistent memory is resistant to power failure. If this approach is proven viable, the software architecture for Hadoop NameNode on Kove can be simplified. + + +2. The size of the Kove memory is virtually unlimited, and these devices can be scaled well beyond a terabyte. With this much memory, the NameNode can store much more information, lifting the limitations on the number of files stored in Hadoop and obviating the need for federation. + + + +The diagram below summarizes our thoughts up until this point. + + + + +Possible approaches + + +Here is the summary of our approaches we have tried. + + +Given that the NameNode stores following data in memory (simplified view), we have these major actors. + + +machine -> blockList (DataNodeMap, DatanodeDescriptor, BlockInfo) + +block -> machineList (BlocksMap, BlockInfo) + + +Also these structures are referenced within FSImage structures (INode, FSDirectory) and some additional structures like CorruptReplicasMap, recentInvalidateSets, PendingBlockInfo, ExcessReplicateMap, PendingReplicationBlocks, UnderReplicatedBlocks. + + +All structures of interest are centered around FSNamesystem and are relatively tightly coupled, which implies careful refactoring with small steps. + +Here is how new HDFS file is created +1). Client -> Namenode + + DFSClient + +DFSOutputStream + + namenode.addBlock (through RPC) + + FSNameSystem.getAdditionalBlock() + + lease + + replicator.chooseTargets() -> DatanodeDesciptor[] + + newBlock = allocateBlock(src, pathINodes); + + FSDirectory.addBlock + + // associate the new list of blocks with this file + +namesystem.blocksMap.addINode(block, fileNode); + +BlockInfo blockInfo = namesystem.blocksMap.getStoredBlock(block); + + fileNode.addBlock(blockInfo); + +pendingFile.setTargets(targets); + + +2). Client -> Datanode + +connect to Datanode directly and transfer data... + +Class diagram displaying some of affected classes + + +Possible ways to implement storing Namenode data on Kove instead of memory +Data exchange with Kove requires usage of special buffer registered with API. Registration takes time (about 100 microseconds), and same for copying data to/from buffers (each read/write about 1 microsecond). + +Thus we have 4 ways to use it: + + +a. Create and register big buffer ourselves and do all modifications right inside the buffer. The buffer has to fit into normal memory. This is the fastest: buffer is created once at the start of NameNode, and cost of data transfer to/from other memory areas is minimized. 
This raises the question: “Will it be easy to implement access to the ‘native’ data structures from the DataNode, and does this bring more overhead?” This is likely the longest option to implement.
+
+b. Create and register smaller buffer(s) ourselves and reuse them for different chunks of data. This requires moving data to and from the buffers, which takes some time, but it also means fewer changes to the DataNode data structures, and it is not limited by the normal memory size.
+
+c. Some combination of (a) and (b): cache the most frequently/recently accessed areas (blocks?) in buffers registered for data exchange. When an area is already in memory, we modify it in place and transfer it to Kove; if not, we evict some buffer and reuse it for the area of interest. This may be an improvement over (b), but it requires additional caching code, and we need to measure the overhead of the caching itself.
+
+d. Have the API register memory and transfer the data behind the scenes. This is the easiest to implement, but probably the slowest: reads and writes go through library-created buffers, and new buffers occasionally have to be registered.
+
+Our implementation
+
+In the end, we implemented the NameNode changes using the Ehcache library. Ehcache is an open-source, standards-based cache for boosting performance, offloading the database, and simplifying scalability. It is one of the most widely used Java-based caches because it is robust, proven, and full-featured.
+
+We used it to replace the in-memory Java objects with Ehcache-backed objects and stored them on the XPD. Then came the time to test.
+
+This implementation can be found on GitHub here, https://github.com/markkerzner/nn_kove.
+
+Testing
+
+For testing we used NNBench and a combination of teragen/terasort. The results of the runs are given below.
+
+One may notice that the performance of the cluster when using Kove is about 50% of the in-memory Hadoop code. This is to be expected: our initial prototype treated the Kove XPD as a KDSA block device, since that was easier to implement. The proper approach, however, is to use direct writes through the Java-to-C interface, which offers roughly twice the performance of the block device. Thus, with a more careful implementation, we should achieve speed comparable to the in-memory Hadoop code.
+
+Appendix: test results
+
+There are four groups of test results given below.
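+Before the raw numbers, here is a rough sketch of the kind of substitution described in the implementation section above, using the classic Ehcache 2.x API. This is not the actual nn_kove code; the class name, cache name, and values are illustrative, and where the cache's overflow store physically lives (for example, on the Kove device) is a deployment detail.
+
+import java.util.Arrays;
+import net.sf.ehcache.Cache;
+import net.sf.ehcache.CacheManager;
+import net.sf.ehcache.Element;
+
+public class BlocksMapCacheSketch {
+    public static void main(String[] args) {
+        CacheManager manager = CacheManager.create();
+        // name, maxElementsInMemory, overflowToDisk, eternal, timeToLive, timeToIdle
+        Cache blocksMap = new Cache("blocksMap", 100000, true, true, 0, 0);
+        manager.addCache(blocksMap);
+
+        // Instead of block -> machineList living in a plain java.util.Map,
+        // entries are stored and retrieved through the cache.
+        blocksMap.put(new Element(12345L, new String[] {"datanode-1", "datanode-4"}));
+
+        Element e = blocksMap.get(12345L);
+        if (e != null) {
+            String[] machines = (String[]) e.getObjectValue();
+            System.out.println("block 12345 -> " + Arrays.toString(machines));
+        }
+        manager.shutdown();
+    }
+}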
+ + +============ BLOCKSMAP + KOVE ============ + + +---- terasort ---- + + + +hadoop jar hadoop-examples-1.1.2.jar teragen 500000 /user/hduser/terasort-input + +hadoop jar hadoop-examples-1.1.2.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output + + +13/08/07 07:12:53 INFO mapred.JobClient: map 0% reduce 0% + +13/08/07 07:12:58 INFO mapred.JobClient: map 100% reduce 0% + +13/08/07 07:13:05 INFO mapred.JobClient: map 100% reduce 33% + +13/08/07 07:13:07 INFO mapred.JobClient: map 100% reduce 100% + +13/08/07 07:13:07 INFO mapred.JobClient: Job complete: job_201308070712_0002 + +13/08/07 07:13:07 INFO mapred.JobClient: Counters: 30 + +13/08/07 07:13:07 INFO mapred.JobClient: Job Counters + +13/08/07 07:13:07 INFO mapred.JobClient: Launched reduce tasks=1 + +13/08/07 07:13:07 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=6462 + +13/08/07 07:13:07 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 + +13/08/07 07:13:07 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 + +13/08/07 07:13:07 INFO mapred.JobClient: Rack-local map tasks=2 + +13/08/07 07:13:07 INFO mapred.JobClient: Launched map tasks=2 + +13/08/07 07:13:07 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=9238 + +13/08/07 07:13:07 INFO mapred.JobClient: File Input Format Counters + +13/08/07 07:13:07 INFO mapred.JobClient: Bytes Read=50000000 + +13/08/07 07:13:07 INFO mapred.JobClient: File Output Format Counters + +13/08/07 07:13:07 INFO mapred.JobClient: Bytes Written=50000000 + +13/08/07 07:13:07 INFO mapred.JobClient: FileSystemCounters + +13/08/07 07:13:07 INFO mapred.JobClient: FILE_BYTES_READ=51000264 + +13/08/07 07:13:07 INFO mapred.JobClient: HDFS_BYTES_READ=50000218 + +13/08/07 07:13:07 INFO mapred.JobClient: FILE_BYTES_WRITTEN=102164352 + +13/08/07 07:13:07 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=50000000 + +13/08/07 07:13:07 INFO mapred.JobClient: Map-Reduce Framework + +13/08/07 07:13:07 INFO mapred.JobClient: Map output materialized bytes=51000012 + +13/08/07 07:13:07 INFO mapred.JobClient: Map input records=500000 + +13/08/07 07:13:07 INFO mapred.JobClient: Reduce shuffle bytes=51000012 + +13/08/07 07:13:07 INFO mapred.JobClient: Spilled Records=1000000 + +13/08/07 07:13:07 INFO mapred.JobClient: Map output bytes=50000000 + +13/08/07 07:13:07 INFO mapred.JobClient: Total committed heap usage (bytes)=602996736 + +13/08/07 07:13:07 INFO mapred.JobClient: CPU time spent (ms)=6860 + +13/08/07 07:13:07 INFO mapred.JobClient: Map input bytes=50000000 + +13/08/07 07:13:07 INFO mapred.JobClient: SPLIT_RAW_BYTES=218 + +13/08/07 07:13:07 INFO mapred.JobClient: Combine input records=0 + +13/08/07 07:13:07 INFO mapred.JobClient: Reduce input records=500000 + +13/08/07 07:13:07 INFO mapred.JobClient: Reduce input groups=500000 + +13/08/07 07:13:07 INFO mapred.JobClient: Combine output records=0 + +13/08/07 07:13:07 INFO mapred.JobClient: Physical memory (bytes) snapshot=615641088 + +13/08/07 07:13:07 INFO mapred.JobClient: Reduce output records=500000 + +13/08/07 07:13:07 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2303033344 + +13/08/07 07:13:07 INFO mapred.JobClient: Map output records=500000 + +13/08/07 07:13:07 INFO terasort.TeraSort: done + + + +map() completion: 1.0 + +reduce() completion: 1.0 + + +Counters: 30 + + Job Counters + + Launched reduce tasks=1 + + SLOTS_MILLIS_MAPS=6462 + + + Total time spent by all reduces waiting after reserving slots (ms)=0 + + Total time spent by all maps waiting after reserving slots (ms)=0 + 
+ Rack-local map tasks=2 + + Launched map tasks=2 + + SLOTS_MILLIS_REDUCES=9238 + + + File Input Format Counters + + Bytes Read=50000000 + + File Output Format Counters + + Bytes Written=50000000 + + FileSystemCounters + + FILE_BYTES_READ=51000264 + + HDFS_BYTES_READ=50000218 + + FILE_BYTES_WRITTEN=102164352 + + + HDFS_BYTES_WRITTEN=50000000 + + Map-Reduce Framework + + Map output materialized bytes=51000012 + + Map input records=500000 + + Reduce shuffle bytes=51000012 + + Spilled Records=1000000 + + Map output bytes=50000000 + + Total committed heap usage (bytes)=602996736 + + CPU time spent (ms)=6860 + + + Map input bytes=50000000 + + SPLIT_RAW_BYTES=218 + + Combine input records=0 + + Reduce input records=500000 + + Reduce input groups=500000 + + Combine output records=0 + + Physical memory (bytes) snapshot=615641088 + + Reduce output records=500000 + + Virtual memory (bytes) snapshot=2303033344 + + Map output records=500000 + + + +---- nn bench ---- + + +hadoop jar hadoop-test-1.1.2.jar nnbench -operation create_write -maps 2 -reduces 1 -blockSize 1 -bytesToWrite 20 -bytesPerChecksum 1 -numberOfFiles 100 -replicationFactorPerFile 1 + + +13/08/07 07:53:08 INFO hdfs.NNBench: -------------- NNBench -------------- : + +13/08/07 07:53:08 INFO hdfs.NNBench: Version: NameNode Benchmark 0.4 + +13/08/07 07:53:08 INFO hdfs.NNBench: Date & time: 2013-08-07 07:53:08,57 + +13/08/07 07:53:08 INFO hdfs.NNBench: + +13/08/07 07:53:08 INFO hdfs.NNBench: Test Operation: create_write + +13/08/07 07:53:08 INFO hdfs.NNBench: Start time: 2013-08-07 07:45:15,177 + +13/08/07 07:53:08 INFO hdfs.NNBench: Maps to run: 2 + +13/08/07 07:53:08 INFO hdfs.NNBench: Reduces to run: 1 + +13/08/07 07:53:08 INFO hdfs.NNBench: Block Size (bytes): 1 + +13/08/07 07:53:08 INFO hdfs.NNBench: Bytes to write: 20 + +13/08/07 07:53:08 INFO hdfs.NNBench: Bytes per checksum: 1 + +13/08/07 07:53:08 INFO hdfs.NNBench: Number of files: 100 + +13/08/07 07:53:08 INFO hdfs.NNBench: Replication factor: 1 + +13/08/07 07:53:08 INFO hdfs.NNBench: Successful file operations: 200 + +13/08/07 07:53:08 INFO hdfs.NNBench: + +13/08/07 07:53:08 INFO hdfs.NNBench: # maps that missed the barrier: 0 + +13/08/07 07:53:08 INFO hdfs.NNBench: # exceptions: 0 + +13/08/07 07:53:08 INFO hdfs.NNBench: + +13/08/07 07:53:08 INFO hdfs.NNBench: TPS: Create/Write/Close: 65 + +13/08/07 07:53:08 INFO hdfs.NNBench: Avg exec time (ms): Create/Write/Close: 60.03 + +13/08/07 07:53:08 INFO hdfs.NNBench: Avg Lat (ms): Create/Write: 3.58 + +13/08/07 07:53:08 INFO hdfs.NNBench: Avg Lat (ms): Close: 56.375 + +13/08/07 07:53:08 INFO hdfs.NNBench: + +13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: AL Total #1: 716 + +13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: AL Total #2: 11275 + +13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: TPS Total (ms): 12006 + +13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: Longest Map Time (ms): 6143.0 + +13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: Late maps: 0 + +13/08/07 07:53:08 INFO hdfs.NNBench: RAW DATA: # of exceptions: 0 + + +------------------------------------------------------------------------------------------------------------------- + +============ BLOCKSMAP + DISK ============ + + +---- terasort ---- + + + +hadoop jar hadoop-examples-1.1.2.jar teragen 500000 /user/hduser/terasort-input + +hadoop jar hadoop-examples-1.1.2.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output + + +13/08/07 08:06:46 INFO mapred.JobClient: Running job: job_201308070806_0002 + +13/08/07 08:06:47 INFO mapred.JobClient: map 0% reduce 0% + 
+13/08/07 08:06:52 INFO mapred.JobClient: map 100% reduce 0% + +13/08/07 08:06:59 INFO mapred.JobClient: map 100% reduce 33% + +13/08/07 08:07:01 INFO mapred.JobClient: map 100% reduce 100% + +13/08/07 08:07:01 INFO mapred.JobClient: Job complete: job_201308070806_0002 + +13/08/07 08:07:01 INFO mapred.JobClient: Counters: 30 + +13/08/07 08:07:01 INFO mapred.JobClient: Job Counters + +13/08/07 08:07:01 INFO mapred.JobClient: Launched reduce tasks=1 + +13/08/07 08:07:01 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=6541 + +13/08/07 08:07:01 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 + +13/08/07 08:07:01 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 + +13/08/07 08:07:01 INFO mapred.JobClient: Rack-local map tasks=2 + +13/08/07 08:07:01 INFO mapred.JobClient: Launched map tasks=2 + +13/08/07 08:07:01 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=9293 + +13/08/07 08:07:01 INFO mapred.JobClient: File Input Format Counters + +13/08/07 08:07:01 INFO mapred.JobClient: Bytes Read=50000000 + +13/08/07 08:07:01 INFO mapred.JobClient: File Output Format Counters + +13/08/07 08:07:01 INFO mapred.JobClient: Bytes Written=50000000 + +13/08/07 08:07:01 INFO mapred.JobClient: FileSystemCounters + +13/08/07 08:07:01 INFO mapred.JobClient: FILE_BYTES_READ=51000264 + +13/08/07 08:07:01 INFO mapred.JobClient: HDFS_BYTES_READ=50000218 + +13/08/07 08:07:01 INFO mapred.JobClient: FILE_BYTES_WRITTEN=102156988 + +13/08/07 08:07:01 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=50000000 + +13/08/07 08:07:01 INFO mapred.JobClient: Map-Reduce Framework + +13/08/07 08:07:01 INFO mapred.JobClient: Map output materialized bytes=51000012 + +13/08/07 08:07:01 INFO mapred.JobClient: Map input records=500000 + +13/08/07 08:07:01 INFO mapred.JobClient: Reduce shuffle bytes=51000012 + +13/08/07 08:07:01 INFO mapred.JobClient: Spilled Records=1000000 + +13/08/07 08:07:01 INFO mapred.JobClient: Map output bytes=50000000 + +13/08/07 08:07:01 INFO mapred.JobClient: Total committed heap usage (bytes)=602996736 + +13/08/07 08:07:01 INFO mapred.JobClient: CPU time spent (ms)=6940 + +13/08/07 08:07:01 INFO mapred.JobClient: Map input bytes=50000000 + +13/08/07 08:07:01 INFO mapred.JobClient: SPLIT_RAW_BYTES=218 + +13/08/07 08:07:01 INFO mapred.JobClient: Combine input records=0 + +13/08/07 08:07:01 INFO mapred.JobClient: Reduce input records=500000 + +13/08/07 08:07:01 INFO mapred.JobClient: Reduce input groups=500000 + +13/08/07 08:07:01 INFO mapred.JobClient: Combine output records=0 + +13/08/07 08:07:01 INFO mapred.JobClient: Physical memory (bytes) snapshot=612827136 + +13/08/07 08:07:01 INFO mapred.JobClient: Reduce output records=500000 + +13/08/07 08:07:01 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2305966080 + +13/08/07 08:07:01 INFO mapred.JobClient: Map output records=500000 + +13/08/07 08:07:01 INFO terasort.TeraSort: done + + + +Counters: 30 + + Job Counters + + Launched reduce tasks=1 + + SLOTS_MILLIS_MAPS=6541 + + + Total time spent by all reduces waiting after reserving slots (ms)=0 + + Total time spent by all maps waiting after reserving slots (ms)=0 + + Rack-local map tasks=2 + + Launched map tasks=2 + + SLOTS_MILLIS_REDUCES=9293 + + + File Input Format Counters + + Bytes Read=50000000 + + File Output Format Counters + + Bytes Written=50000000 + + FileSystemCounters + + FILE_BYTES_READ=51000264 + + HDFS_BYTES_READ=50000218 + + FILE_BYTES_WRITTEN=102156988 + + + HDFS_BYTES_WRITTEN=50000000 + + Map-Reduce Framework + + Map output 
materialized bytes=51000012 + + Map input records=500000 + + Reduce shuffle bytes=51000012 + + Spilled Records=1000000 + + Map output bytes=50000000 + + Total committed heap usage (bytes)=602996736 + + CPU time spent (ms)=6940 + + + Map input bytes=50000000 + + SPLIT_RAW_BYTES=218 + + Combine input records=0 + + Reduce input records=500000 + + Reduce input groups=500000 + + Combine output records=0 + + Physical memory (bytes) snapshot=612827136 + + Reduce output records=500000 + + Virtual memory (bytes) snapshot=2305966080 + + Map output records=500000 + + + + + +---- nn bench ---- + + +hadoop jar hadoop-test-1.1.2.jar nnbench -operation create_write -maps 2 -reduces 1 -blockSize 1 -bytesToWrite 20 -bytesPerChecksum 1 -numberOfFiles 100 -replicationFactorPerFile 1 + + +13/08/07 08:11:17 INFO hdfs.NNBench: -------------- NNBench -------------- : + +13/08/07 08:11:17 INFO hdfs.NNBench: Version: NameNode Benchmark 0.4 + +13/08/07 08:11:17 INFO hdfs.NNBench: Date & time: 2013-08-07 08:11:17,388 + +13/08/07 08:11:17 INFO hdfs.NNBench: + +13/08/07 08:11:17 INFO hdfs.NNBench: Test Operation: create_write + +13/08/07 08:11:17 INFO hdfs.NNBench: Start time: 2013-08-07 08:11:01,121 + +13/08/07 08:11:17 INFO hdfs.NNBench: Maps to run: 2 + +13/08/07 08:11:17 INFO hdfs.NNBench: Reduces to run: 1 + +13/08/07 08:11:17 INFO hdfs.NNBench: Block Size (bytes): 1 + +13/08/07 08:11:17 INFO hdfs.NNBench: Bytes to write: 20 + +13/08/07 08:11:17 INFO hdfs.NNBench: Bytes per checksum: 1 + +13/08/07 08:11:17 INFO hdfs.NNBench: Number of files: 100 + +13/08/07 08:11:17 INFO hdfs.NNBench: Replication factor: 1 + +13/08/07 08:11:17 INFO hdfs.NNBench: Successful file operations: 200 + +13/08/07 08:11:17 INFO hdfs.NNBench: + +13/08/07 08:11:17 INFO hdfs.NNBench: # maps that missed the barrier: 0 + +13/08/07 08:11:17 INFO hdfs.NNBench: # exceptions: 0 + +13/08/07 08:11:17 INFO hdfs.NNBench: + +13/08/07 08:11:17 INFO hdfs.NNBench: TPS: Create/Write/Close: 65 + +13/08/07 08:11:17 INFO hdfs.NNBench: Avg exec time (ms): Create/Write/Close: 58.86 + +13/08/07 08:11:17 INFO hdfs.NNBench: Avg Lat (ms): Create/Write: 3.18 + +13/08/07 08:11:17 INFO hdfs.NNBench: Avg Lat (ms): Close: 55.59 + +13/08/07 08:11:17 INFO hdfs.NNBench: + +13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: AL Total #1: 636 + +13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: AL Total #2: 11118 + +13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: TPS Total (ms): 11772 + +13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: Longest Map Time (ms): 6122.0 + +13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: Late maps: 0 + +13/08/07 08:11:17 INFO hdfs.NNBench: RAW DATA: # of exceptions: 0 + +13/08/07 08:11:17 INFO hdfs.NNBench: + + + +============ REGULAR HADOOP + DISK ============ + + +---- terasort ---- + + + +hadoop jar hadoop-examples-1.1.2.jar teragen 500000 /user/hduser/terasort-input + +hadoop jar hadoop-examples-1.1.2.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output + + +13/08/07 08:26:03 INFO mapred.JobClient: Running job: job_201308070825_0002 + +13/08/07 08:26:04 INFO mapred.JobClient: map 0% reduce 0% + +13/08/07 08:26:08 INFO mapred.JobClient: map 100% reduce 0% + +13/08/07 08:26:15 INFO mapred.JobClient: map 100% reduce 33% + +13/08/07 08:26:17 INFO mapred.JobClient: map 100% reduce 100% + +13/08/07 08:26:17 INFO mapred.JobClient: Job complete: job_201308070825_0002 + +13/08/07 08:26:17 INFO mapred.JobClient: Counters: 30 + +13/08/07 08:26:17 INFO mapred.JobClient: Job Counters + +13/08/07 08:26:17 INFO mapred.JobClient: Launched reduce tasks=1 
+ +13/08/07 08:26:17 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=6249 + +13/08/07 08:26:17 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 + +13/08/07 08:26:17 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 + +13/08/07 08:26:17 INFO mapred.JobClient: Launched map tasks=2 + +13/08/07 08:26:17 INFO mapred.JobClient: Data-local map tasks=2 + +13/08/07 08:26:17 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=9218 + +13/08/07 08:26:17 INFO mapred.JobClient: File Input Format Counters + +13/08/07 08:26:17 INFO mapred.JobClient: Bytes Read=50000000 + +13/08/07 08:26:17 INFO mapred.JobClient: File Output Format Counters + +13/08/07 08:26:17 INFO mapred.JobClient: Bytes Written=50000000 + +13/08/07 08:26:17 INFO mapred.JobClient: FileSystemCounters + +13/08/07 08:26:17 INFO mapred.JobClient: FILE_BYTES_READ=51000264 + +13/08/07 08:26:17 INFO mapred.JobClient: HDFS_BYTES_READ=50000218 + +13/08/07 08:26:17 INFO mapred.JobClient: FILE_BYTES_WRITTEN=102156990 + +13/08/07 08:26:17 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=50000000 + +13/08/07 08:26:17 INFO mapred.JobClient: Map-Reduce Framework + +13/08/07 08:26:17 INFO mapred.JobClient: Map output materialized bytes=51000012 + +13/08/07 08:26:17 INFO mapred.JobClient: Map input records=500000 + +13/08/07 08:26:17 INFO mapred.JobClient: Reduce shuffle bytes=51000012 + +13/08/07 08:26:17 INFO mapred.JobClient: Spilled Records=1000000 + +13/08/07 08:26:17 INFO mapred.JobClient: Map output bytes=50000000 + +13/08/07 08:26:17 INFO mapred.JobClient: Total committed heap usage (bytes)=602996736 + +13/08/07 08:26:17 INFO mapred.JobClient: CPU time spent (ms)=6690 + +13/08/07 08:26:17 INFO mapred.JobClient: Map input bytes=50000000 + +13/08/07 08:26:17 INFO mapred.JobClient: SPLIT_RAW_BYTES=218 + +13/08/07 08:26:17 INFO mapred.JobClient: Combine input records=0 + +13/08/07 08:26:17 INFO mapred.JobClient: Reduce input records=500000 + +13/08/07 08:26:17 INFO mapred.JobClient: Reduce input groups=500000 + +13/08/07 08:26:17 INFO mapred.JobClient: Combine output records=0 + +13/08/07 08:26:17 INFO mapred.JobClient: Physical memory (bytes) snapshot=609116160 + +13/08/07 08:26:17 INFO mapred.JobClient: Reduce output records=500000 + +13/08/07 08:26:17 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2309636096 + +13/08/07 08:26:17 INFO mapred.JobClient: Map output records=500000 + +13/08/07 08:26:17 INFO terasort.TeraSort: done + + + +Counters: 30 + + Job Counters + + Launched reduce tasks=1 + + SLOTS_MILLIS_MAPS=6249 + + + Total time spent by all reduces waiting after reserving slots (ms)=0 + + Total time spent by all maps waiting after reserving slots (ms)=0 + + Launched map tasks=2 + + Data-local map tasks=2 + + SLOTS_MILLIS_REDUCES=9218 + + + File Input Format Counters + + Bytes Read=50000000 + + File Output Format Counters + + Bytes Written=50000000 + + FileSystemCounters + + FILE_BYTES_READ=51000264 + + HDFS_BYTES_READ=50000218 + + FILE_BYTES_WRITTEN=102156990 + + + HDFS_BYTES_WRITTEN=50000000 + + Map-Reduce Framework + + Map output materialized bytes=51000012 + + Map input records=500000 + + Reduce shuffle bytes=51000012 + + Spilled Records=1000000 + + Map output bytes=50000000 + + Total committed heap usage (bytes)=602996736 + + CPU time spent (ms)=6690 + + + Map input bytes=50000000 + + SPLIT_RAW_BYTES=218 + + Combine input records=0 + + Reduce input records=500000 + + Reduce input groups=500000 + + Combine output records=0 + + Physical memory (bytes) snapshot=609116160 + + 
Reduce output records=500000 + + Virtual memory (bytes) snapshot=2309636096 + + Map output records=500000 + + + +---- nn bench ---- + + +hadoop jar hadoop-test-1.1.2.jar nnbench -operation create_write -maps 2 -reduces 1 -blockSize 1 -bytesToWrite 20 -bytesPerChecksum 1 -numberOfFiles 100 -replicationFactorPerFile 1 + + +13/08/07 08:30:45 INFO hdfs.NNBench: -------------- NNBench -------------- : + +13/08/07 08:30:45 INFO hdfs.NNBench: Version: NameNode Benchmark 0.4 + +13/08/07 08:30:45 INFO hdfs.NNBench: Date & time: 2013-08-07 08:30:45,180 + +13/08/07 08:30:45 INFO hdfs.NNBench: + +13/08/07 08:30:45 INFO hdfs.NNBench: Test Operation: create_write + +13/08/07 08:30:45 INFO hdfs.NNBench: Start time: 2013-08-07 08:30:30,955 + +13/08/07 08:30:45 INFO hdfs.NNBench: Maps to run: 2 + +13/08/07 08:30:45 INFO hdfs.NNBench: Reduces to run: 1 + +13/08/07 08:30:45 INFO hdfs.NNBench: Block Size (bytes): 1 + +13/08/07 08:30:45 INFO hdfs.NNBench: Bytes to write: 20 + +13/08/07 08:30:45 INFO hdfs.NNBench: Bytes per checksum: 1 + +13/08/07 08:30:45 INFO hdfs.NNBench: Number of files: 100 + +13/08/07 08:30:45 INFO hdfs.NNBench: Replication factor: 1 + +13/08/07 08:30:45 INFO hdfs.NNBench: Successful file operations: 200 + +13/08/07 08:30:45 INFO hdfs.NNBench: + +13/08/07 08:30:45 INFO hdfs.NNBench: # maps that missed the barrier: 0 + +13/08/07 08:30:45 INFO hdfs.NNBench: # exceptions: 0 + +13/08/07 08:30:45 INFO hdfs.NNBench: + +13/08/07 08:30:45 INFO hdfs.NNBench: TPS: Create/Write/Close: 87 + +13/08/07 08:30:45 INFO hdfs.NNBench: Avg exec time (ms): Create/Write/Close: 42.895 + +13/08/07 08:30:45 INFO hdfs.NNBench: Avg Lat (ms): Create/Write: 3.16 + +13/08/07 08:30:45 INFO hdfs.NNBench: Avg Lat (ms): Close: 39.655 + +13/08/07 08:30:45 INFO hdfs.NNBench: + +13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: AL Total #1: 632 + +13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: AL Total #2: 7931 + +13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: TPS Total (ms): 8579 + +13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: Longest Map Time (ms): 4547.0 + +13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: Late maps: 0 + +13/08/07 08:30:45 INFO hdfs.NNBench: RAW DATA: # of exceptions: 0 + +13/08/07 08:30:45 INFO hdfs.NNBench: + + +============ REGULAR HADOOP + KOVE ============ + + +---- terasort ---- + + + +hadoop jar hadoop-examples-1.1.2.jar teragen 500000 /user/hduser/terasort-input + +hadoop jar hadoop-examples-1.1.2.jar terasort /user/hduser/terasort-input /user/hduser/terasort-output + + +13/08/07 08:35:25 INFO mapred.JobClient: Running job: job_201308070834_0002 + +13/08/07 08:35:26 INFO mapred.JobClient: map 0% reduce 0% + +13/08/07 08:35:31 INFO mapred.JobClient: map 100% reduce 0% + +13/08/07 08:35:38 INFO mapred.JobClient: map 100% reduce 33% + +13/08/07 08:35:40 INFO mapred.JobClient: map 100% reduce 100% + +13/08/07 08:35:40 INFO mapred.JobClient: Job complete: job_201308070834_0002 + +13/08/07 08:35:40 INFO mapred.JobClient: Counters: 30 + +13/08/07 08:35:40 INFO mapred.JobClient: Job Counters + +13/08/07 08:35:40 INFO mapred.JobClient: Launched reduce tasks=1 + +13/08/07 08:35:40 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=6390 + +13/08/07 08:35:40 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 + +13/08/07 08:35:40 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 + +13/08/07 08:35:40 INFO mapred.JobClient: Rack-local map tasks=2 + +13/08/07 08:35:40 INFO mapred.JobClient: Launched map tasks=2 + +13/08/07 08:35:40 INFO 
mapred.JobClient: SLOTS_MILLIS_REDUCES=9240 + +13/08/07 08:35:40 INFO mapred.JobClient: File Input Format Counters + +13/08/07 08:35:40 INFO mapred.JobClient: Bytes Read=50000000 + +13/08/07 08:35:40 INFO mapred.JobClient: File Output Format Counters + +13/08/07 08:35:40 INFO mapred.JobClient: Bytes Written=50000000 + +13/08/07 08:35:40 INFO mapred.JobClient: FileSystemCounters + +13/08/07 08:35:40 INFO mapred.JobClient: FILE_BYTES_READ=51000264 + +13/08/07 08:35:40 INFO mapred.JobClient: HDFS_BYTES_READ=50000218 + +13/08/07 08:35:40 INFO mapred.JobClient: FILE_BYTES_WRITTEN=102162937 + +13/08/07 08:35:40 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=50000000 + +13/08/07 08:35:40 INFO mapred.JobClient: Map-Reduce Framework + +13/08/07 08:35:40 INFO mapred.JobClient: Map output materialized bytes=51000012 + +13/08/07 08:35:40 INFO mapred.JobClient: Map input records=500000 + +13/08/07 08:35:40 INFO mapred.JobClient: Reduce shuffle bytes=51000012 + +13/08/07 08:35:40 INFO mapred.JobClient: Spilled Records=1000000 + +13/08/07 08:35:40 INFO mapred.JobClient: Map output bytes=50000000 + +13/08/07 08:35:40 INFO mapred.JobClient: Total committed heap usage (bytes)=602996736 + +13/08/07 08:35:40 INFO mapred.JobClient: CPU time spent (ms)=6660 + +13/08/07 08:35:40 INFO mapred.JobClient: Map input bytes=50000000 + +13/08/07 08:35:40 INFO mapred.JobClient: SPLIT_RAW_BYTES=218 + +13/08/07 08:35:40 INFO mapred.JobClient: Combine input records=0 + +13/08/07 08:35:40 INFO mapred.JobClient: Reduce input records=500000 + +13/08/07 08:35:40 INFO mapred.JobClient: Reduce input groups=500000 + +13/08/07 08:35:40 INFO mapred.JobClient: Combine output records=0 + +13/08/07 08:35:40 INFO mapred.JobClient: Physical memory (bytes) snapshot=611500032 + +13/08/07 08:35:40 INFO mapred.JobClient: Reduce output records=500000 + +13/08/07 08:35:40 INFO mapred.JobClient: Virtual memory (bytes) snapshot=2300420096 + +13/08/07 08:35:40 INFO mapred.JobClient: Map output records=500000 + +13/08/07 08:35:40 INFO terasort.TeraSort: done + + + +Counters: 30 + + Job Counters + + Launched reduce tasks=1 + + SLOTS_MILLIS_MAPS=6390 + + + Total time spent by all reduces waiting after reserving slots (ms)=0 + + Total time spent by all maps waiting after reserving slots (ms)=0 + + Rack-local map tasks=2 + + Launched map tasks=2 + + SLOTS_MILLIS_REDUCES=9240 + + + File Input Format Counters + + Bytes Read=50000000 + + File Output Format Counters + + Bytes Written=50000000 + + FileSystemCounters + + FILE_BYTES_READ=51000264 + + HDFS_BYTES_READ=50000218 + + FILE_BYTES_WRITTEN=102162937 + + + HDFS_BYTES_WRITTEN=50000000 + + Map-Reduce Framework + + Map output materialized bytes=51000012 + + Map input records=500000 + + Reduce shuffle bytes=51000012 + + Spilled Records=1000000 + + Map output bytes=50000000 + + Total committed heap usage (bytes)=602996736 + + CPU time spent (ms)=6660 + + + Map input bytes=50000000 + + SPLIT_RAW_BYTES=218 + + Combine input records=0 + + Reduce input records=500000 + + Reduce input groups=500000 + + Combine output records=0 + + Physical memory (bytes) snapshot=611500032 + + Reduce output records=500000 + + Virtual memory (bytes) snapshot=2300420096 + + Map output records=500000 + + + +---- nn bench ---- + + +hadoop jar hadoop-test-1.1.2.jar nnbench -operation create_write -maps 2 -reduces 1 -blockSize 1 -bytesToWrite 20 -bytesPerChecksum 1 -numberOfFiles 100 -replicationFactorPerFile 1 + + +13/08/07 08:42:43 INFO hdfs.NNBench: -------------- NNBench -------------- : + +13/08/07 08:42:43 INFO hdfs.NNBench: 
Version: NameNode Benchmark 0.4 + +13/08/07 08:42:43 INFO hdfs.NNBench: Date & time: 2013-08-07 08:42:43,678 + +13/08/07 08:42:43 INFO hdfs.NNBench: + +13/08/07 08:42:43 INFO hdfs.NNBench: Test Operation: create_write + +13/08/07 08:42:43 INFO hdfs.NNBench: Start time: 2013-08-07 08:42:29,426 + +13/08/07 08:42:43 INFO hdfs.NNBench: Maps to run: 2 + +13/08/07 08:42:43 INFO hdfs.NNBench: Reduces to run: 1 + +13/08/07 08:42:43 INFO hdfs.NNBench: Block Size (bytes): 1 + +13/08/07 08:42:43 INFO hdfs.NNBench: Bytes to write: 20 + +13/08/07 08:42:43 INFO hdfs.NNBench: Bytes per checksum: 1 + +13/08/07 08:42:43 INFO hdfs.NNBench: Number of files: 100 + +13/08/07 08:42:43 INFO hdfs.NNBench: Replication factor: 1 + +13/08/07 08:42:43 INFO hdfs.NNBench: Successful file operations: 200 + +13/08/07 08:42:43 INFO hdfs.NNBench: + +13/08/07 08:42:43 INFO hdfs.NNBench: # maps that missed the barrier: 0 + +13/08/07 08:42:43 INFO hdfs.NNBench: # exceptions: 0 + +13/08/07 08:42:43 INFO hdfs.NNBench: + +13/08/07 08:42:43 INFO hdfs.NNBench: TPS: Create/Write/Close: 90 + +13/08/07 08:42:43 INFO hdfs.NNBench: Avg exec time (ms): Create/Write/Close: 42.665 + +13/08/07 08:42:43 INFO hdfs.NNBench: Avg Lat (ms): Create/Write: 3.015 + +13/08/07 08:42:43 INFO hdfs.NNBench: Avg Lat (ms): Close: 39.61 + +13/08/07 08:42:43 INFO hdfs.NNBench: + +13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: AL Total #1: 603 + +13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: AL Total #2: 7922 + +13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: TPS Total (ms): 8533 + +13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: Longest Map Time (ms): 4437.0 + +13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: Late maps: 0 + +13/08/07 08:42:43 INFO hdfs.NNBench: RAW DATA: # of exceptions: 0 + +13/08/07 08:42:43 INFO hdfs.NNBench: + + + +Conclusion + + +It has been shown that running Hadoop NameNode on a Kove XPD improves cluster reliability and removes the memory size limitation usual for the RAM-based NameNode. + + +Planned enhancements include making fuller utilitzation of all the capabilities of the Kove XPD, described here http://kove.com/, such as its fast block copy of terabytes of data in a matter of seconds. 
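The runs above were all launched with the stock hadoop jar example and test jars. For repeated side-by-side comparisons (regular HDFS vs. the Kove-backed NameNode) it can be convenient to drive the same commands from code; the sketch below is not part of the original benchmark setup — it uses only the JDK's ProcessBuilder, assumes the hadoop launcher is on the PATH, and reuses the jar name and HDFS paths shown in the TeraSort run above, all of which should be adjusted for your cluster.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

/** Minimal sketch: launch one of the benchmark commands shown above and echo its log output. */
public class BenchmarkRunner {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Same TeraSort invocation as above; the jar version and HDFS paths are assumptions.
        ProcessBuilder pb = new ProcessBuilder(
                "hadoop", "jar", "hadoop-examples-1.1.2.jar", "terasort",
                "/user/hduser/terasort-input", "/user/hduser/terasort-output");
        pb.redirectErrorStream(true); // merge stderr (where JobClient progress is logged) into stdout
        Process p = pb.start();
        try (BufferedReader r = new BufferedReader(new InputStreamReader(p.getInputStream()))) {
            String line;
            while ((line = r.readLine()) != null) {
                System.out.println(line); // e.g. "INFO mapred.JobClient:  map 100% reduce 100%"
            }
        }
        System.exit(p.waitFor()); // a non-zero exit code means the benchmark job failed
    }
}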
\ No newline at end of file diff --git "a/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt.xml.xls" "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt.xml.xls" new file mode 100644 index 0000000000000000000000000000000000000000..c8dfe8d64e2c402e7860c1ac020527c429887335 Binary files /dev/null and "b/src/main/resources/cdtocode/doc/Hadoop HDFS/Hadoop clusters with Kove\302\256 XPD\342\204\242 persistent memory.txt.xml.xls" differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6ac402123412e806c28ad637e9d49b6d57598eb --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt @@ -0,0 +1,294 @@ +Key Design of HDFS Architecture +March 31, 2021 +HDFS (Hadoop Distributed File System) is a distributed file system for big data storage by Apache. It is implemented within the Hadoop framework, and several design features are needed for it to process, distribute, and store big data effectively. + +HDFS (Hadoop Distributed File System) is similar to other distributed file systems except for some key differences such as fault-tolerance, high throughput, and the ability to be deployed on low-cost hardware. + +Overview +This article will cover: + +Introduction to Big data framework +General design of HDFS architecture +Configuring HDFS data storage policies +Colocation and its benefits in HDFS +Introduction to Big data framework +Big data is data in sets that are of high volume and complexity beyond what traditional data processing software applications can deal with. A big data framework is characterized by the 4Vs, namely: + +Variety (data is of various forms and types) +Velocity (data processing speed is high) +Value (low data value density) +Volume (massive amount of data) +Apache Hadoop is among the frameworks that can process data with the characteristics described above. Within the Hadoop framework is the Hadoop Distributed File System (HDFS). + +HDFS is a distributed file system of the Hadoop technical framework that was developed based on the Google File System (GFS) and is used to manage files on multiple independent physical servers. + +It is applied in the following scenarios: + +Ecosystem data storage.
+Website user behavior data storage. +Meteorological data storage. +General design of HDFS architecture +The HDFS architecture has several design features that enable it to work efficiently, including the following: + +Federation storage: HDFS creates a distinction between the namespace and storage. The two are separated to create a block storage layer. + +High Availability: HDFS supports features such as data replication, which enhances the availability of the system. A single block of data is replicated on 3 nodes, so that even if a single node fails, a client can access the block from the 2 other nodes. + +Data can still be accessed normally even when a failure occurs on the ‘DataNode’ or ‘NameNode’. + +A ‘NameNode’ is a primary component within the HDFS framework that stores the metadata of files, manages and maintains ‘DataNodes’, and assigns them tasks. It is also known as the master node. + +A ‘DataNode’ is a node that stores the actual data within HDFS and performs creation, deletion, and replication of data blocks. It also serves read and write requests for clients and is usually known as the slave node. + +Multiple access modes: Within HDFS, data can be accessed through HTTP in a browser, through the Java API for applications, or through command shells (see the minimal Java client sketch at the end of this article). + +Space reclamation: Space that had been released in HDFS can be reclaimed. This is implemented by a recycle bin mechanism where deleted data can be restored from the recycle bin to occupy its initial space. The number of replicas can also be dynamically set. + +NameNode/DataNode in master/slave mode: HDFS consists of NameNodes and DataNodes that work in a master/slave architecture. A single cluster consists of only one NameNode, which regulates data access by clients and manages the namespace within the file system. + +The DataNode receives instructions from the NameNode on when to create, delete, and replicate data blocks. + +Unified file system Namespace: HDFS is presented externally as a coherent file system. Any external process perceives the system as one unified system. + +Data replication: In HDFS, a file’s blocks are replicated for fault tolerance and the number of replicas can be specified by an application. This can be done at creation time but may be changed later. + +Metadata persistence: The HDFS NameNode stores the namespace. The NameNode consistently records every change that occurs in file system metadata in a transaction log file called the ‘EditLog’. + +Whenever a new file is created in HDFS, the NameNode inserts a record into the ‘EditLog’ indicating the creation of the new file. + +This information is also synchronized between the active and the standby NameNode periodically. + +Robustness: HDFS stores data reliably even when a failure occurs. + +Its robustness takes into account the 3 common types of failures: + +DataNode failure +NameNode failure +Network failure +Data organization: Data is stored in blocks of 64MB in HDFS. + +HDFS Data Integrity Assurance: HDFS ensures the completeness of the stored data by implementing reliability processing in case of failure of each component. + +HDFS accomplishes this by doing the following: + +Reconstructing data replicas on invalid data disks - the DataNode periodically reports block messages to the NameNode; if one replica (block) fails, the NameNode starts the procedure to recover the lost replicas.
+Ensures data balance among DataNodes - the HDFS architecture is configured with the data balance mechanism, which ensures the even distribution of data among all DataNodes. +Ensures metadata reliability - the transaction log mechanism is used to operate metadata, which is stored on both active and standby NameNodes. The snapshot mechanism of the file system ensures that data can be recovered promptly when a misoperation occurs. +Provides the security mode - HDFS provides a unique security mode to prevent a fault from spreading when a DataNode or hard disk is faulty. +Data storage policy: HDFS supports 5 storage policies, namely: + +Hot – Storage on DISK. +Warm – Storage on both DISK and ARCHIVE. +Cold – Storage on ARCHIVE. +One_SSD – Storage of a single replica on SSD and other replicas on DISK. +All_SSD – Storage of all replicas on SSD. +Configuring HDFS data storage policies +By default, the HDFS NameNode automatically selects DataNodes to store data replicas. + +Storage can also be configured for the following scenarios: + +Layered storage +Select a proper storage device for layered data storage from multiple devices on a DataNode. + +The HDFS layered storage architecture provides four types of storage devices: + +RAM_DISK (memory virtualization hard disk) +DISK (mechanical hard disk) +ARCHIVE (high-density and low-cost storage media) +SSD (solid-state disk) +To formulate storage policies for different scenarios, the four types of storage devices are combined. + +Tag storage +Select a proper DataNode according to directory tags, which indicate data importance levels. + +Node Group Storage +Stores key data in highly reliable node groups because the DataNode cluster uses heterogeneous servers. + +Colocation and its benefits in HDFS +Colocation is the storage of associated data, or data that will be associated, on the same storage node. + +This is a solution to the heavy consumption of network resources during massive data migrations, which greatly affects the processing speed of massive data and overall system performance. + +Benefits of colocation +Reduces network bandwidth and resource consumption. +Enables easy and quick access to data. +To wrap up +As mentioned earlier, HDFS is similar to other distributed file systems except for some distinct differences that serve as strengths of HDFS over other Distributed File Systems. + +These distinct differences are its fault-tolerance, its high throughput, and its ability to be deployed on low-cost hardware and support large data sets. + +Happy learning. + +Relevant resources +HDFS Architecture Guide +Characteristics of HDFS +Big Data Huawei
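The article above notes that HDFS is reachable through the Java API as well as HTTP and shell commands. The following is a minimal client sketch, not the article's own code: it assumes a hadoop-client dependency on the classpath, a hypothetical NameNode address (hdfs://namenode:9000) and path (/user/demo/hello.txt), and shows a write, a read, and the per-file replication setting discussed above.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** Minimal HDFS Java API sketch: write a file, re-read it, and adjust its replication factor. */
public class HdfsClientSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://namenode:9000"); // assumption: replace with your NameNode URI
        try (FileSystem fs = FileSystem.get(conf)) {
            Path file = new Path("/user/demo/hello.txt"); // hypothetical path
            try (FSDataOutputStream out = fs.create(file, true)) { // overwrite if it already exists
                out.write("hello hdfs".getBytes(StandardCharsets.UTF_8));
            }
            fs.setReplication(file, (short) 3); // per-file replication, as described above
            byte[] buf = new byte[64];
            try (FSDataInputStream in = fs.open(file)) {
                int n = in.read(buf);
                System.out.println(new String(buf, 0, n, StandardCharsets.UTF_8));
            }
        }
    }
}

The storage policies listed above (Hot, Warm, Cold, One_SSD, All_SSD) are applied per path rather than in this kind of client code; on recent Hadoop releases they are typically set with the hdfs storagepolicies -setStoragePolicy command.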
diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..f76135c40e11237c2acb283adfb44aac93ba9584 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/Key Design of HDFS Architecture.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cbf2094b3a61046925de9fe63866eea634391a5 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt @@ -0,0 +1,380 @@ +The Hadoop Distributed File System: +Architecture and Design +by Dhruba Borthakur +Table of contents +1 Introduction .......................................................................................................................3 +2 Assumptions and Goals .....................................................................................................3 +2.1 Hardware Failure........................................................................................................... 3 +2.2 Streaming Data Access .................................................................................................3 +2.3 Large Data Sets .............................................................................................................3 +2.4 Simple Coherency Model .............................................................................................
3 +2.5 Moving computation is cheaper than moving data .......................................................4 +2.6 Portability across Heterogeneous Hardware and Software Platforms ..........................4 +3 Namenode and Datanode .................................................................................................. 4 +4 The File System Namespace .............................................................................................5 +5 Data Replication ................................................................................................................5 +5.1 Replica Placement . The First Baby Steps ....................................................................5 +5.2 Replica Selection .......................................................................................................... 6 +5.3 SafeMode ......................................................................................................................6 +6 The Persistence of File System Metadata ......................................................................... 7 +7 The Communication Protocol ........................................................................................... 8 +8 Robustness ........................................................................................................................ 8 +8.1 Data Disk Failure, Heartbeats and Re-Replication .......................................................8 +8.2 Cluster Rebalancing ......................................................................................................8 +8.3 Data Correctness ...........................................................................................................8 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +8.4 Metadata Disk Failure .................................................................................................. 9 +8.5 Snapshots ......................................................................................................................9 +9 Data Organization ............................................................................................................. 9 +9.1 Data Blocks .................................................................................................................. 9 +9.2 Staging ........................................................................................................................10 +9.3 Pipelining ....................................................................................................................10 +10 Accessibility .................................................................................................................. 10 +10.1 DFSShell ...................................................................................................................11 +10.2 DFSAdmin ................................................................................................................11 +10.3 Browser Interface ......................................................................................................11 +11 Space Reclamation ........................................................................................................ 11 +11.1 File Deletes and Undelete ......................................................................................... 11 +11.2 Decrease Replication Factor ..................................................................................... 
12 +12 References ..................................................................................................................... 12 +The Hadoop Distributed File System: Architecture and Design +Page 2 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +1. Introduction +The Hadoop File System (HDFS) is a distributed file system running on commodity +hardware. It has many similarities with existing distributed file systems. However, the +differences from other distributed file systems are significant. HDFS is highly fault-tolerant +and can be deployed on low-cost hardware. HDFS provides high throughput access to +application data and is suitable for applications that have large datasets. HDFS relaxes a few +POSIX requirements to enable streaming access to file system data. HDFS was originally +built as infrastructure for the open source web crawler Apache Nutch project. HDFS is part +of the Hadoop Project, which is part of the Lucene Apache Project. The Project URL is here. +2. Assumptions and Goals +2.1. Hardware Failure +Hardware Failure is the norm rather than the exception. The entire HDFS file system may +consist of hundreds or thousands of server machines that store pieces of file system data. +The fact that there are a huge number of components and that each component has a +non-trivial probability of failure means that some component of HDFS is always +non-functional. Therefore, detection of faults and quick, automatic recovery from +those faults are core architectural goals of HDFS. +2.2. Streaming Data Access +Applications that run on HDFS need streaming access to their data sets. They are not general +purpose applications that typically run on a general purpose file system. HDFS is designed +more for batch processing than for interactive use by users. The emphasis is on throughput +of data access rather than latency of data access. POSIX imposes many hard requirements +that are not needed for applications that are targeted for HDFS. POSIX semantics in a few +key areas have been traded off to further enhance data throughput rates. +2.3. Large Data Sets +Applications that run on HDFS have large data sets. This means that a typical file in HDFS is +gigabytes to terabytes in size. Thus, HDFS is tuned to support large files. It should provide +high aggregate data bandwidth and should scale to hundreds of nodes in a single cluster. It +should support tens of millions of files in a single cluster. +2.4. Simple Coherency Model +The Hadoop Distributed File System: Architecture and Design +Page 3 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +Most HDFS applications need a write-once-read-many access model for files. A file once +created, written and closed need not be changed. This assumption simplifies data coherency +issues and enables high throughput data access. A Map-Reduce application or a +Web-Crawler application fits perfectly with this model. There is a plan to support +appending-writes to a file in the future. +2.5. Moving computation is cheaper than moving data +A computation requested by an application is most optimal if the computation can be done +near where the data is located. This is especially true when the size of the data set is huge. +This eliminates network congestion and increases overall throughput of the system. The +assumption is that it is often better to migrate the computation closer to where the data is +located rather than moving the data to where the application is running.
HDFS provides +interfaces for applications to move themselves closer to where the data is located. +2.6. Portability across Heterogeneous Hardware and Software Platforms +HDFS should be designed in such a way that it is easily portable from one platform to +another. This facilitates widespread adoption of HDFS as a platform of choice for a large set +of applications. +3. Namenode and Datanode +HDFS has a master/slave architecture. An HDFS cluster consists of a single Namenode, a +master server that manages the filesystem namespace and regulates access to files by clients. +In addition, there are a number of Datanodes, one per node in the cluster, which manage +storage attached to the nodes that they run on. HDFS exposes a file system namespace and +allows user data to be stored in files. Internally, a file is split into one or more blocks and +these blocks are stored in a set of Datanodes. The Namenode makes filesystem namespace +operations like opening, closing, renaming etc. of files and directories. It also determines the +mapping of blocks to Datanodes. The Datanodes are responsible for serving read and write +requests from filesystem clients. The Datanodes also perform block creation, deletion, and +replication upon instruction from the Namenode. +The Namenode and Datanode are pieces of software that run on commodity machines. These +machines are typically commodity Linux machines. HDFS is built using the Java language; +any machine that support Java can run the Namenode or the Datanode. Usage of the highly +portable Java language means that HDFS can be deployed on a wide range of machines. A +typical deployment could have a dedicated machine that runs only the Namenode software. +Each of the other machines in the cluster runs one instance of the Datanode software. The +The Hadoop Distributed File System: Architecture and Design +Page 4 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +architecture does not preclude running multiple Datanodes on the same machine but in a +real-deployment that is never the case. +The existence of a single Namenode in a cluster greatly simplifies the architecture of the +system. The Namenode is the arbitrator and repository for all HDFS metadata. The system is +designed in such a way that user data never flows through the Namenode. +4. The File System Namespace +HDFS supports a traditional hierarchical file organization. A user or an application can create +directories and store files inside these directories. The file system namespace hierarchy is +similar to most other existing file systems. One can create and remove files, move a file from +one directory to another, or rename a file. HDFS does not yet implement user quotas and +access permissions. HDFS does not support hard links and soft links. However, the HDFS +architecture does not preclude implementing these features at a later time. +The Namenode maintains the file system namespace. Any change to the file system +namespace and properties are recorded by the Namenode. An application can specify the +number of replicas of a file that should be maintained by HDFS. The number of copies of a +file is called the replication factor of that file. This information is stored by the Namenode. +5. Data Replication +HDFS is designed to reliably store very large files across machines in a large cluster. It stores +each file as a sequence of blocks; all blocks in a file except the last block are the same size. +Blocks belonging to a file are replicated for fault tolerance. 
The block size and replication +factor are configurable per file. Files in HDFS are write-once and have strictly one writer at +any time. An application can specify the number of replicas of a file. The replication factor +can be specified at file creation time and can be changed later. +The Namenode makes all decisions regarding replication of blocks. It periodically receives +Heartbeat and a Blockreport from each of the Datanodes in the cluster. A receipt of a +heartbeat implies that the Datanode is in good health and is serving data as desired. A +Blockreport contains a list of all blocks on that Datanode. +5.1. Replica Placement . The First Baby Steps +The selection of placement of replicas is critical to HDFS reliability and performance. This +feature distinguishes HDFS from most other distributed file systems. This is a feature that +needs lots of tuning and experience. The purpose of a rack-aware replica placement is to +improve data reliability, availability, and network bandwidth utilization. The current +The Hadoop Distributed File System: Architecture and Design +Page 5 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +implementation for the replica placement policy is a first effort in this direction. The +short-term goals of implementing this policy are to validate it on production systems, learn +more about its behavior and build a foundation to test and research more sophisticated +policies in the future. +HDFS runs on a cluster of computers that spread across many racks. Communication +between two nodes on different racks has to go through switches. In most cases, network +bandwidth between two machines in the same rack is greater than network bandwidth +between two machines on different racks. +At startup time, each Datanode determines the rack it belongs to and notifies the Namenode +of the rack id upon registration. HDFS provides APIs to facilitate pluggable modules that can +be used to determine the rack identity of a machine. A simple but non-optimal policy is to +place replicas across racks. This prevents losing data when an entire rack fails and allows use +of bandwidth from multiple racks when reading data. This policy evenly distributes replicas +in the cluster and thus makes it easy to balance load on component failure. However, this +policy increases the cost of writes because a write needs to transfer blocks to multiple racks. +For the most common case when the replica factor is three, HDFS.s placement policy is to +place one replica on the local node, place another replica on a different node at the local rack, +and place the last replica on different node at a different rack. This policy cuts the inter-rack +write traffic and improves write performance. The chance of rack failure is far less than that +of node failure; this policy does not impact data reliability and availability guarantees. But it +reduces the aggregate network bandwidth when reading data since a block is placed in only +two unique racks rather than three. The replicas of a file do not evenly distribute across the +racks. One third of replicas are on one node, two thirds of the replicas are on one rack; the +other one third of replicas is evenly distributed across all the remaining racks. This policy +improves write performance while not impacting data reliability or read performance. +The implementation of the above policy is work-in-progress. +5.2. Replica Selection +HDFS tries to satisfy a read request from a replica that is closest to the reader. 
If there exists +a replica on the same rack as the reader node, then that replica is preferred to satisfy the read +request. If a HDFS cluster spans multiple data centers, then a replica that is resident in the +local data center is preferred over remote replicas. +5.3. SafeMode +On startup, the Namenode enters a special state called Safemode. Replication of data blocks +does not occur when the Namenode is in Safemode state. The Namenode receives Heartbeat +The Hadoop Distributed File System: Architecture and Design +Page 6 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +and Blockreport from the Datanodes. A Blockreport contains the list of data blocks that a +Datanode reports to the Namenode. Each block has a specified minimum number of replicas. +A block is considered safely-replicated when the minimum number of replicas of that data +block has checked in with the Namenode. When a configurable percentage of +safely-replicated data blocks checks in with the Namenode (plus an additional 30 seconds), +the Namenode exits the Safemode state. It then determines the list of data blocks (if any) that +have fewer than the specified number of replicas. The Namenode then replicates these blocks +to other Datanodes. +6. The Persistence of File System Metadata +The HDFS namespace is stored by the Namenode. The Namenode uses a transaction log +called the EditLog to persistently record every change that occurs to file system metadata. +For example, creating a new file in HDFS causes the Namenode to insert a record into the +EditLog indicating this change. Similarly, changing the replication factor of a file causes a +new record to be inserted into the EditLog. The Namenode uses a file in its local file system +to store the Edit Log. The entire file system namespace, the mapping of blocks to files and +filesystem properties are stored in a file called the FsImage. The FsImage is a file in the +Namenode.s local file system too. +The Namenode has an image of the entire file system namespace and file Blockmap in +memory. This metadata is designed to be compact, so that a 4GB memory on the Namenode +machine is plenty to support a very large number of files and directories. When the +Namenode starts up, it reads the FsImage and EditLog from disk, applies all the transactions +from the EditLog into the in-memory representation of the FsImage and then flushes out this +new metadata into a new FsImage on disk. It can then truncate the old EditLog because its +transactions have been applied to the persistent FsImage. This process is called a checkpoint. +In the current implementation, a checkpoint occurs when the Namenode starts up. Work is in +progress to support periodic checkpointing in the near future. +The Datanode stores HDFS data into files in its local file system. The Datanode has no +knowledge about HDFS files. It stores each block of HDFS data in a separate file in its local +file system. The Datanode does not create all files in the same directory. Instead, it uses a +heuristic to determine the optimal number of files per directory. It creates subdirectories +appropriately. It is not optimal to create all local files in the same directory because the local +file system might not be able to efficiently support a huge number of files in a single +directory. When a Datanode starts up, it scans through its local file system, generates a list of +all HDFS data blocks that correspond to each of these local files and sends this report to the +Namenode. 
This report is called the Blockreport. +The Hadoop Distributed File System: Architecture and Design +Page 7 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +7. The Communication Protocol +All communication protocols are layered on top of the TCP/IP protocol. A client establishes +a connection to a well-defined and configurable port on the Namenode machine. It talks the +ClientProtocol with the Namenode. The Datanodes talk to the Namenode using the +DatanodeProtocol. The details on these protocols will be explained later on. A Remote +Procedure Call (RPC) abstraction wraps the ClientProtocol and the DatanodeProtocol. By +design, the Namenode never initiates an RPC. It responds to RPC requests issued by a +Datanode or a client. +8. Robustness +The primary objective of HDFS is to store data reliably even in the presence of failures. The +three types of common failures are Namenode failures, Datanode failures and network +partitions. +8.1. Data Disk Failure, Heartbeats and Re-Replication +A Datanode sends a heartbeat message to the Namenode periodically. A network partition +can cause a subset of Datanodes to lose connectivity with the Namenode. The Namenode +detects this condition be a lack of heartbeat message. The Namenode marks these Datanodes +as dead and does not forward any new IO requests to these Datanodes. The data that was +residing on those Datanodes are not available to HDFS any more. This may cause the +replication factor of some blocks to fall below their specified value. The Namenode +determines all the blocks that need to be replicated and starts replicating them to other +Datanodes. The necessity for re-replication may arise due to many reasons: a Datanode +becoming unavailable, a corrupt replica, a bad disk on the Datanode or an increase of the +replication factor of a file. +8.2. Cluster Rebalancing +The HDFS architecture is compatible with data rebalancing schemes. It is possible that data +may move automatically from one Datanode to another if the free space on a Datanode falls +below a certain threshold. Also, a sudden high demand for a particular file can dynamically +cause creation of additional replicas and rebalancing of other data in the cluster. These types +of rebalancing schemes are not yet implemented. +8.3. Data Correctness +It is possible that a block of data fetched from a Datanode is corrupted. This corruption can +The Hadoop Distributed File System: Architecture and Design +Page 8 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +occur because of faults in the storage device, a bad network or buggy software. The HDFS +client implements checksum checking on the contents of a HDFS file. When a client creates a +HDFS file, it computes a checksum of each block on the file and stores these checksums in a +separate hidden file in the same HDFS namespace. When a client retrieves file contents it +verifies that the data it received from a Datanode satisfies the checksum stored in the +checksum file. If not, then the client can opt to retrieve that block from another Datanode that +has a replica of that block. +8.4. Metadata Disk Failure +The FsImage and the EditLog are central data structures of HDFS. A corruption of these files +can cause the entire cluster to be non-functional. For this reason, the Namenode can be +configured to support multiple copies of the FsImage and EditLog. Any update to either the +FsImage or EditLog causes each of the FsImages and EditLogs to get updated synchronously. 
+This synchronous updating of multiple EditLogs may degrade the rate of namespace +transactions per second that a Namenode can support. But this degradation is acceptable +because HDFS applications are very data intensive in nature; they are not metadata intensive. +A Namenode, when it restarts, selects the latest consistent FsImage and EditLog to use. +The Namenode machine is a single point of failure for the HDFS cluster. If a Namenode +machine fails, manual intervention is necessary. Currently, automatic restart and failover of +the Namenode software to another machine is not supported. +8.5. Snapshots +Snapshots support storing a copy of data at a particular instant of time. One usage of the +snapshot-feature may be to roll back a corrupted cluster to a previously known good point in +time. HDFS currently does not support snapshots, but they will be supported in a future release. +9. Data Organization +9.1. Data Blocks +HDFS is designed to support large files. Applications that are compatible with HDFS are +those that deal with large data sets. These applications write the data only once; they read the +data one or more times and require that reads are satisfied at streaming speeds. HDFS +supports write-once-read-many semantics on files. A typical block size used by HDFS is 64 +MB. Thus, a HDFS file is chopped up into 64 MB chunks, and each chunk could reside on a +different Datanode. +The Hadoop Distributed File System: Architecture and Design +Page 9 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +9.2. Staging +A client-request to create a file does not reach the Namenode immediately. In fact, the HDFS +client caches the file data into a temporary local file. An application-write is transparently +redirected to this temporary local file. When the local file accumulates data worth over a +HDFS block size, the client contacts the Namenode. The Namenode inserts the file name into +the file system hierarchy and allocates a data block for it. The Namenode responds to the +client request with the identity of the Datanode(s) and the destination data block. The client +flushes the block of data from the local temporary file to the specified Datanode. When a file +is closed, the remaining un-flushed data in the temporary local file is transferred to the +Datanode. The client then instructs the Namenode that the file is closed. At this point, the +Namenode commits the file creation operation into a persistent store. If the Namenode dies +before the file is closed, the file is lost. +The above approach has been adopted after careful consideration of target applications that +run on HDFS. Applications need streaming writes to files. If a client writes to a remote file +directly without any client side buffering, the network speed and the congestion in the +network impact throughput considerably. This approach is not without precedent either. +Earlier distributed file systems, e.g. AFS, have used client side caching to improve +performance. A POSIX requirement has been relaxed to achieve higher performance of data +uploads. +9.3. Pipelining +When a client is writing data to a HDFS file, its data is first written to a local file as +explained above. Suppose the HDFS file has a replication factor of three. When the local file +accumulates a block of user data, the client retrieves a list of Datanodes from the Namenode. +This list represents the Datanodes that will host a replica of that block. The client then +flushes the data block to the first Datanode.
The first Datanode starts receiving the data in +small portions (4 KB), writes each portion to its local repository and transfers that portion to +the second Datanode in the list. The second Datanode, in turn, starts receiving each portion of +the data block, writes that portion to its repository and then flushes that portion to the third +Datanode. The third Datanode writes the data to its local repository. A Datanode could be +receiving data from the previous one in the pipeline and at the same time it could be +forwarding data to the next one in the pipeline. Thus, the data is pipelined from one Datanode +to the next. +10. Accessibility +HDFS can be accessed by application by many different ways. Natively, HDFS provides a +The Hadoop Distributed File System: Architecture and Design +Page 10 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +Java API for applications to use. A C language wrapper for this Java API is available. A +HTTP browser can also be used to browse the file in HDFS. Work is in progress to expose a +HDFS content repository through the WebDAV Protocol. +10.1. DFSShell +HDFS allows user data to be organized in the form of files and directories. It provides an +interface called DFSShell that lets a user interact with the data in HDFS. The syntax of this +command set is similar to other shells (e.g. bash, csh) that users are already familiar with. +Here are some sample commands: +Create a directory named /foodir : hadoop dfs -mkdir /foodir +View a file /foodir/myfile.txt : hadoop dfs -cat /foodir/myfile.txt +Delete a file /foodir/myfile.txt : hadoop dfs -rm /foodir myfile.txt +The command syntax for DFSShell is targeted for applications that need a scripting language +to interact with the stored data. +10.2. DFSAdmin +The DFSAdmin command set is used for administering a dfs cluster. These are commands +that are used only by a HDFS administrator. Here are some sample commands: +Put a cluster in Safe Mode : bin/hadoop dfsadmin -safemode enter +Generate a list of Datanodes : bin/hadoop dfsadmin -report +Decommission a Datanode : bin/hadoop dfsadmin -decommission datanodename +10.3. Browser Interface +A typical HDFS install configures a web-server to expose the HDFS namespace through a +configurable port. This allows a Web browser to navigate the HDFS namespace and view +contents of a HDFS file. +11. Space Reclamation +11.1. File Deletes and Undelete +When a file is deleted by a user or an application, it is not immediately removed from HDFS. +HDFS renames it to a file in the /trash directory. The file can be restored quickly as long as it +The Hadoop Distributed File System: Architecture and Design +Page 11 +Copyright © 2005 The Apache Software Foundation. All rights reserved. +remains in /trash. A file remains in /trash for a configurable amount of time. After the expiry +of its life in /trash, the Namenode deletes the file from the HDFS namespace. The deletion of +the file causes the blocks associated with the file to be freed. There could be an appreciable +time delay between the time a file is deleted by a user and the time of the corresponding +increase in free space in HDFS. +A user can Undelete a file after deleting it as long as it remains in the /trash directory. If a +user wants to undelete a file that he/she has deleted, he/she can navigate the /trash directory +and retrieve the file. The /trash directory contains only the latest copy of the file that was +deleted. 
The /trash directory is just like any other directory with one special feature: HDFS +applies specified policies to automatically delete files from this directory. The current default +policy is to delete files that are older than 6 hours. In future, this policy will be configurable +through a well defined interface. +11.2. Decrease Replication Factor +When the replication factor of a file is reduced, the Namenode selects excess replicas that can +be deleted. The next Heartbeat transfers this information to the Datanode. The Datanode then +removes the corresponding blocks and the corresponding free space appears in the cluster. +The point to note here is that there might be a time delay between the completion of the +setReplication API and the appearance of free space in the cluster. +12. References +Browse the HDFS Java Interface +Download the HDFS source code +The Hadoop Distributed File System: Architecture and Design +Page 12 +Copyright © 2005 The Apache Software Foundation. All rights reserved. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..9ca46f8638cdce70e740b411bfe7493707d7fadc Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/The Hadoop Distributed File System Architecture and Design.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt b/src/main/resources/cdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt new file mode 100644 index 0000000000000000000000000000000000000000..22c07123044141d801714c213d49334f7f0c1328 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt @@ -0,0 +1,205 @@ +Towards A Scalable HDFS Architecture +Farag Azzedin +Information and Computer Science Department +King Fahd University of Petroleum and Minerals +Dhahran, 31261, Saudi Arabia +fazzedin@kfupm.edu.sa +Abstract—Cloud computing infrastructures allow corporations to reduce costs by outsourcing computations on-demand. One of the areas cloud computing is increasingly being utilized for is large scale data processing. Apache Hadoop is one of these large scale data processing projects that supports data-intensive distributed applications. Hadoop applications utilize a distributed file system for data storage called Hadoop Distributed File System (HDFS). HDFS architecture, by design, has only a single master node called NameNode, which manages and maintains the metadata of storage nodes, called Datanodes, in its RAM. Hence, HDFS Datanodes’ metadata is restricted by the capacity of the RAM of the HDFS’s single-point-of-failure NameNode. This paper proposes a fault tolerant, highly available and widely scalable HDFS architecture. The proposed architecture provides a distributed NameNode space eliminating the drawbacks of the current HDFS architecture. This is achieved by integrating the Chord protocol into the HDFS architecture. +Keywords-Cloud Computing Platform, Hadoop, HDFS, Chord, Distributed NameNode +I. INTRODUCTION AND RELATED WORK +Cloud computing environments can provide large-scale datacenters at reduced cost by using or integrating the service models namely software as a service (SaaS), platform as a service (PaaS) and infrastructure as a service (IaaS). 
This results in the increased income for cloud computing service providers and decrease costs for cloud users [1, 2]. This motivated researchers to introduce systems such as Google's MapReduce, Google File System (GFS) and Hadoop [3]. The Apache Hadoop is one of the cloud computing projects; built using Java aiming to develop open-source software for reliable and scalable data intensive distributed computing [2, 3, 4]. Yahoo!, which uses Hadoop in its applications [3], has been the major contributor to this project. +Hadoop applications utilize a distributed file system for data storage called Hadoop Distributed File System (HDFS). HDFS implementation may contain hundreds or even thousands of server machines, each storing some part of the file system’s data. This opens the door for more hardware failures because more server machines mean more hardware and thus high probability of hardware failures. This makes it inevitable that some component of HDFS is always non-functional. Thus, faults detection, alarms, and automatic prompt server recovery are fundamental architectural goals of HDFS [3, 7]. The same applies for a NameNode server in an HDFS cluster. HDFS’s performance heavily relies on the availability of single NameNode machine. Currently, automatic restart and failover of the NameNode software to another machine is not supported [3, 7]. +Hadoop applications utilize HDFS [7], which has only a single master node called NameNode, which manages and maintains the metadata of storage nodes, called Datanodes, in its RAM [7]. Hence, HDFS Datanodes’ metadata is restricted by the capacity of the RAM of the HDFS’s single-point-of-failure NameNode. This paper proposes a fault tolerant, highly available and widely scalable HDFS architecture having a NameNode which is distributed and will not suffer HDFS failure in case of a single NameNode failure. This is achieved by using Chord protocol to introduce clustering in HDFS NameNode. +Several research projects have used Chord as a basis for their research. In a peer-to-peer system, the Chord File System (CFS) stores files and metadata and uses Chord to locate storage blocks [12]. Several investigation methods have proved that Chord’s stabilization algorithms with fewer changes provide decent lookup performance, regardless of constant failure and joining of nodes [13, 17]. DNS provides a lookup service, with host names as keys and IP addresses (and other host information) as values [9]. Chord could provide the same service by hashing each host name to a key [11]. Chord-based DNS would require no special servers, while ordinary DNS relies on a set of special root servers. DNS requires manual management of the routing information (NS records) that allows clients to navigate the name server hierarchy; Chord automatically maintains the correctness of the analogous routing information [9]. DNS only works well when host names are structured to reflect administrative boundaries; Chord imposes no naming structure [9]. DNS is specialized to the task of finding named hosts or services, while Chord can also be used to find data objects that are not tied to particular machines [9]. +The rest of the paper is organized as follows. Hadoop architecture is briefly introduced in Section II. This is followed by Section III explaining how Hadoop and HDFS work. 
Next, we discuss the issues in current Hadoop architecture with respect to its NameNode, while Section IV outlines the problem statement and our motivation to propose a modification to the existing HDFS architecture. We also 978-1-4673-6404-1/13/$31.00 ©2013 IEEE 155 +Authorized licensed use limited to: BEIHANG UNIVERSITY. Downloaded on September 12,2021 at 03:53:07 UTC from IEEE Xplore. Restrictions apply. +discuss the Chord architecture in Sections V. Section VI presents the proposed HDFS NameNode solution and architecture. We conclude and specify areas of future work in Section VII. +II. EXISTING HADOOP ARCHITECTURE +Hadoop includes several sub-projects namely Hadoop Common HDFS, and MapReduce besides several others [4]. This section briefly discusses the sub-projects of Hadoop namely Hadoop Common, HDFS and MapReduce. +Hadoop Common: Hadoop Common provides the common utilities to support the other Hadoop subprojects and enables entry point to the Hadoop filesystems [3, 4]. Hadoop Common contains Filesystem, RPC, serialization libraries, some required jar files and scripts to initiate Hadoop [3]. Also, Hadoop Common provides source code, documents, and a contribution area that contains other Hadoop community projects [3]. +HDFS: Although Hadoop supports other storage filesystems to store datasets including Amazon S3 filesystem, CloudStore (a.k.a. Kosmos Distributed File System), FTP filesystem, and Read-only HTTP and HTTPS filesystems [3], HDFS is its primary storage and rack-aware filesystem, that is used by its applications [3, 4]. HDFS is designed to work with large data sets requiring tens of petabytes of storage. HDFS operates on top of the filesystems of the underlying OS [3]. HDFS is written in Java language [3]. HDFS is highly fault-tolerant and is designed to be deployed on low-cost hardware [7]. +HDFS has master/slave architecture. A typical Hadoop cluster is mainly comprised of a NameNode and several Datanode machines. The NameNode manages the HDFS namespace and regulates access to files that are requested by clients [7]. Datanodes, which manage storage attached to the nodes that they run on, store the actual data [6, 7]. +The NameNode and Datanode are software programs designed to run on everyday use machines. These machines typically run on a GNU/Linux OS. HDFS can be run on any machine that supports Java and therefore can run either a NameNode or the Datanode software. Usage of the highly portable and all pervasive Java language means that HDFS can be deployed on a wide range of machines. A typical deployment has a dedicated machine that runs only the NameNode software. Each of the other machines in the cluster runs one instance of the Datanode software. The architecture does not prevent running multiple Datanodes on the same machine but, in practice, that is rarely the case [7]. +MapReduce: For huge data sets of distributed applications, MapReduce is well known for its simplicity and functionality [7]. It serves as an integral part of Hadoop to support distributed computing on large data sets on clusters of computers [8]. MapReduce can be applied on the data stored in either a filesystem (unstructured) or within a database (structured) [8]. During a typical ―Map‖ function, the master node accepts a major input, slices it into several minor sub-problems, and allocates them to worker nodes [8]. A worker node could repeat this process again, if needed, resulting in a multi-level tree structure [8]. 
Finally, the worker node processes the received problem chunk and returns the processed data to its master node [8]. In the "Reduce" function, the master node receives the processed sub-problems and aggregates them in some way to form the output [8].
+MapReduce was chosen as part of the Hadoop project because it allows the map and reduce operations to be distributed transparently. Multiple map functions can run in parallel, provided that each mapping operation is independent of the others; in practice this is limited by the data source and/or the number of CPUs near that data. Likewise, a set of reducers can run at the same time during the reduction phase, provided that all map outputs sharing the same key are presented to the same reducer. Although this procedure may look inefficient compared to sequential algorithms, MapReduce can be applied to far larger datasets than a single "commodity" server can handle; for instance, a large server cluster can use MapReduce to sort a petabyte of data in only a few hours. This parallelism also offers a degree of high availability in case of a partial failure of servers or storage during the operation: if one mapper or reducer fails, the work can be rescheduled, provided the input data are still available [8].
+III. HOW HADOOP AND HDFS WORK
+For effective work scheduling, the filesystem usually provides location awareness, i.e., the name of the rack (for instance, of the network switch) where a worker node is located [3]. This information helps Hadoop applications execute commands on the corresponding compute nodes [3]. When HDFS replicates data, it uses this information to keep multiple copies of the data on different racks [3]. The overall objective is to provide high data availability in case of a rack power or switch failure [3]. HDFS creates numerous replicas of data blocks and distributes them over Datanodes throughout a cluster to enable reliable and extremely fast computation [5]. HDFS provides high-throughput access to application data [7].
+HDFS uses the TCP/IP layer for communication, and clients use RPC to communicate with each other. HDFS can store large files (an ideal file size is a multiple of 64 MB) across multiple Datanode machines. It achieves reliability by duplicating the data across multiple hosts and hence does not require RAID storage on the hosts. With the default replication value of 3, data are stored on three nodes: two on the same rack and one on a different rack. Datanodes can talk to each other to rebalance data, to move copies around, and to keep the replication of data high [3].
+Figure 1 shows how Hadoop uses HDFS to function. Clients contact the single NameNode machine for file metadata or file modifications and perform the actual file I/O directly with the Datanodes [6]. To make it easy to find out the current status of a cluster, the NameNode and Datanodes expose built-in web servers [6]. The NameNode is the arbitrator and repository for all HDFS metadata [7], and the system is designed so that user data never flow through the NameNode [7].
+Figure 1: How the HDFS architecture works [7]
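+To make the division of labor in Figure 1 concrete, the short sketch below reads a file through the public Hadoop FileSystem API: the client consults the NameNode only for metadata, while the bytes are streamed from the Datanodes. This sketch is an editor's illustration rather than part of the paper, and the cluster URI and file path are placeholder assumptions.
+// Illustrative sketch: reading an HDFS file through the client API (placeholder URI and path).
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URI;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public class HdfsReadExample {
+    public static void main(String[] args) throws Exception {
+        Configuration conf = new Configuration();
+        // The NameNode is contacted for metadata (block locations) only.
+        try (FileSystem fs = FileSystem.get(URI.create("hdfs://namenode-host:9000"), conf);
+             FSDataInputStream in = fs.open(new Path("/data/sample.txt"));
+             BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                System.out.println(line); // block contents are streamed directly from the Datanodes
+            }
+        }
+    }
+}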
+The NameNode receives a periodic heartbeat message from each Datanode. A network partition can cause a subset of Datanodes to lose connectivity with the NameNode. The NameNode marks as dead any Datanode that has not sent a recent heartbeat and stops forwarding new I/O requests to it; data registered to dead Datanodes becomes unavailable to HDFS. Datanode death may cause the replication factor of some blocks to fall below their specified value. The NameNode constantly tracks which blocks need to be replicated and initiates replication whenever necessary. The need for re-replication can arise for several reasons: a Datanode may become unavailable, a replica may become corrupted, a hard disk on a Datanode may fail, or the replication factor of a file may be increased [7].
+HDFS exposes a file system namespace and allows user data to be stored in block-based chunks over the Datanodes. Specifically, a file is split into one or more blocks, and these blocks are stored on a set of Datanodes; the NameNode keeps a reference to these blocks in a block pool. The NameNode executes namespace operations such as opening, closing, and renaming files and directories, and it also determines and maintains the mapping of blocks to Datanodes. The Datanodes are responsible for serving read and write requests from the file system's clients, and they perform block creation, deletion, and replication upon instruction from the NameNode [7].
+The NameNode stores the HDFS namespace. It keeps a transaction log called the EditLog to continuously record every modification of the file system metadata. For instance, creating a new file in HDFS causes the NameNode to insert a record into the EditLog representing this action; likewise, changing the replication factor of a file causes another record to be inserted. The NameNode uses a file in its local host OS file system to store the EditLog. The entire file system namespace, including the mapping of blocks to files and the file system properties, is stored in a file called the FsImage, which is also stored in the NameNode's local file system [7].
+The image of the entire file system namespace and file block map is kept in the NameNode's system memory (RAM). This key metadata item is designed to be compact, such that a NameNode with 4 GB of RAM is plenty to support a large number of files and directories. When the NameNode starts up, it reads the FsImage and EditLog from disk, applies all the transactions from the EditLog to the in-memory representation of the FsImage, and flushes this new version out to a new FsImage on disk. It can then truncate the old EditLog, because its transactions have been applied to the persistent FsImage. This procedure is known as a checkpoint; in the current implementation, a checkpoint only occurs when the NameNode starts up [7].
+The Datanode stores HDFS data in files in its local file system. It has no knowledge of HDFS files; it stores each block of HDFS data in a separate local file. The Datanode does not create all files in the same directory. Instead, it uses heuristics to determine the optimal number of files per directory and creates subdirectories appropriately, because the local file system might not be able to efficiently support a huge number of files in a single directory.
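+The per-directory heuristic just described can be pictured with a tiny sketch. The code below is an editor's illustration, not the actual HDFS Datanode code, and the two-level layout and fan-out value are assumptions chosen only to show the idea of spreading block files over subdirectories.
+// Illustrative only (not HDFS source): spread block files over a two-level directory tree
+// so that no single directory has to hold a huge number of files.
+import java.io.File;
+
+public class BlockDirLayout {
+    private static final int FANOUT = 256; // assumed number of subdirectories per level
+
+    /** Maps a block id to a path such as <root>/subdir17/subdir204/blk_<id>. */
+    public static File fileForBlock(File rootDir, long blockId) {
+        int d1 = (int) ((blockId >>> 8) & 0xFF) % FANOUT;
+        int d2 = (int) (blockId & 0xFF) % FANOUT;
+        File dir = new File(rootDir, "subdir" + d1 + File.separator + "subdir" + d2);
+        dir.mkdirs(); // create the intermediate directories on demand
+        return new File(dir, "blk_" + blockId);
+    }
+}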
+When a Datanode starts up, it scans its local file system, generates a list of all the HDFS data blocks that correspond to its local files, and sends this report to the NameNode: this is the Blockreport [7].
+A client request to create a file does not reach the NameNode immediately. Instead, the HDFS client initially caches the file data in a temporary local file, and application writes are transparently redirected to this temporary file. When the local file accumulates more than one HDFS block worth of data, the client contacts the NameNode. The NameNode inserts the file name into the file system hierarchy and allocates a data block for it, then responds to the client with the identity of the Datanode and the destination data block. The client flushes the block of data from the local temporary file to the specified Datanode. When the file is closed, the remaining un-flushed data in the temporary local file is transferred to the Datanode, and the client informs the NameNode that the file is closed. At this point, the NameNode commits the file creation operation into its persistent store. If the NameNode dies before the file is closed, the file is lost [7].
+IV. PROBLEM STATEMENT AND MOTIVATION
+The use of a single NameNode machine in a cluster greatly simplifies the architecture of Hadoop [7]. However, this simplicity comes at a cost, namely scalability and high availability issues.
+Scalability Issue: The RAM size of the HDFS NameNode server limits the metadata, and thus the maximum number of Datanodes and/or the related transactional metadata, that an HDFS cluster can support [7].
+High Availability Issue: HDFS needs one distinguished manager/controller server machine, the NameNode, which is a single point of failure for an HDFS deployment. Once this NameNode fails, HDFS goes offline; after it comes back online, it must respond to all outstanding client requests and Datanode management operations. The NameNode restoration process can take over half an hour for a large cluster. HDFS also includes a Secondary NameNode, a name that misleads some people into thinking that when the primary NameNode fails, the Secondary NameNode becomes active and takes over. In reality, the Secondary NameNode only builds periodic image-based snapshots of the primary NameNode's directory information and saves them to local or remote directories. These image-based checkpoints can only be used to restart a failed primary NameNode without replaying the entire journal of HDFS actions (the EditLog) to rebuild an up-to-date directory structure [3].
+The above issues in the HDFS NameNode architecture motivated us to find a way to make the NameNode seamlessly highly available and scalable, and thus to improve HDFS performance. The Chord protocol and its applications can provide a reliable and efficient solution to these problems. This paper proposes such a solution by modifying the existing HDFS NameNode architecture.
+V. CHORD PROTOCOL
+The function of the Chord protocol is simple: given a key, it maps that key to a node [9]. Depending on the application using Chord, this node may be responsible for storing the value corresponding to that key [9]. Chord employs consistent hashing [10] to allocate keys to Chord nodes [9].
Because each node receives approximately the same number of keys, consistent hashing balances load and requires comparatively little reallocation of keys when nodes join and leave the system [9].
+The Chord protocol addresses the fundamental issues that arise when a node joins or leaves a cluster, including load balancing, scalability, and availability. It achieves load balancing by acting as a distributed hash function and assigning keys uniformly over the nodes. Chord is highly scalable and works for very large clusters of nodes, since the cost of a Chord lookup grows only with the logarithm of the number of nodes, and no additional parameter tuning is needed to achieve this scaling. Chord achieves high availability by automatically adjusting its internal tables as nodes join and leave the cluster (due to failures, maintenance, etc.), which keeps the cluster highly fault tolerant. The node responsible for a key can therefore always be found, irrespective of nodes joining or leaving the cluster, even if the system is in a continuous state of change [9].
+The Chord software takes the form of a library to be linked with the applications that use it. An application interacts with Chord in two ways. First, the Chord library provides a lookup function that returns the IP address of the node responsible for a key. Second, the Chord software on each node notifies the application of changes in the set of keys that the node is responsible for; this allows, for instance, the application software to move the corresponding values to their new locations when a node joins or leaves the cluster [9].
+Authentication, caching, replication, and user-friendly naming of data are provided by the application that uses Chord, and Chord's flat key space simplifies their implementation. For instance, for data authentication, an application may store data under a Chord key derived from a cryptographic hash of the data itself. Likewise, an application may replicate data by storing it under two distinct Chord keys derived from the data's application-level identifier. Other settings where Chord can provide a good basis include cooperative mirroring and time-shared storage [9].
+In cooperative mirroring, several content providers cooperate to store and serve each other's data. The nodes could, for example, belong to a set of software development projects, each making periodic releases. Spreading the total load evenly over all nodes lowers the aggregate cost of the cluster, because each node only needs to provide capacity for the average load rather than for its own peak load. Dabek et al. [12] describe an implementation of this idea that uses Chord to map data blocks onto servers; Chord cooperates with the application to balance load, replicate data, and select servers by latency [9].
+In time-shared storage, nodes that want high availability but have only sporadic connectivity can offer to store other nodes' data while they are connected, in return for having their own data stored elsewhere when they are disconnected. The data's name can serve as the key that identifies the live Chord node acting as its store at any given time.
Many of the same issues arise as in the cooperative mirroring application, although the goal here is high availability rather than load balancing [9].
+VI. PROPOSED ARCHITECTURE
+Our aim is to enhance the HDFS architecture so that the HDFS NameNode is highly available and scalable. This is accomplished by integrating the Chord protocol into the existing HDFS NameNode architecture. We refer to the proposed architecture as NameNode Clustered Using Chord (NCUC).
+A. NCUC Architecture
+In the existing architecture, with its single HDFS NameNode, clients contact the single NameNode machine for file metadata or file modifications and perform the actual file I/O directly with the Datanodes [6]. We refer to these client requests as resource requests. Each resource request is hashed into a key using a consistent hashing algorithm to form a resource request query (RRQ). The NameNode responds to a client resource request with a resource request reply (RRP).
+The NCUC architecture is simple. The RRQ is passed to the NCUC black box, which provides the client with the RRP. The client, upon receiving the RRP, contacts the respective Datanodes specified in the RRP to perform the desired I/O requests for the data chunks stored on those Datanodes. This workflow is shown in Figure 2. Using consistent hashing, NCUC maps keys to NCUC NameNodes in the following way. Identifiers are arranged on an NCUC NameNode identifier ring modulo 2^k, and a key z is mapped to the first node whose identifier is equal to or follows the identifier of z in the identifier space.
+B. Inside the NCUC Black Box
+NCUC uses a consistent hash function to assign each NameNode and each key a k-bit identifier, using SHA-1 [16] as the base hash function. A NameNode's identifier is obtained by hashing the NameNode's static IP address, whereas a key identifier is obtained by hashing the key. The identifier length k should be large enough to make the probability of two NameNodes or keys hashing to the same identifier negligible.
+Figure 3: NCUC black box containing ten NameNodes storing five keys
+This node is referred to as the successor NameNode of key z, or succ(z). If the identifiers are arranged as a circle of numbers from 0 to 2^k − 1, then succ(z) is the first NameNode clockwise from z. This can be understood from the following example. Figure 3 shows an NCUC ring with k = 6 that has ten NameNodes and stores five keys. Here, the successor of identifier 10, succ(10), is NameNode 14, so key 10 is located at NameNode 14. Likewise, keys 24 and 30 are located at NameNode 32, key 38 at NameNode 38, and key 54 at NameNode 56.
+Consistent hashing is intended to let NameNodes join and leave the NCUC cluster with little disruption. When a NameNode n joins the NCUC ring, some keys previously allocated to n's successor are reassigned to n in order to preserve the consistent hashing mapping; when n departs the ring, all of its keys are reallocated to its successor. No other changes to the allocation of keys to NCUC NameNodes are required. In Figure 3, if a NameNode joined the NCUC ring with identifier 26, it would take over the key with identifier 24 from the NameNode with identifier 32.
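+The succ(z) rule lends itself to a compact implementation. The sketch below is an editor's illustration (not the paper's code): it hashes NameNode addresses and request keys onto the identifier ring with SHA-1 and finds the successor with a sorted map; the identifier length and the class and field names are assumptions.
+// Illustrative consistent-hashing ring for the succ(z) rule (not the paper's implementation).
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.util.Map;
+import java.util.TreeMap;
+
+public class NcucRing {
+    private static final int K = 32;                                     // assumed identifier length in bits
+    private static final BigInteger RING = BigInteger.ONE.shiftLeft(K);  // identifier space of size 2^k
+    private final TreeMap<BigInteger, String> nodes = new TreeMap<>();   // identifier -> NameNode address
+
+    static BigInteger id(String s) throws Exception {
+        byte[] digest = MessageDigest.getInstance("SHA-1").digest(s.getBytes(StandardCharsets.UTF_8));
+        return new BigInteger(1, digest).mod(RING);                      // k-bit identifier on the ring
+    }
+
+    void join(String nameNodeAddress) throws Exception {
+        nodes.put(id(nameNodeAddress), nameNodeAddress);
+    }
+
+    /** succ(z): the first NameNode whose identifier is equal to or follows z on the ring. */
+    String successor(String requestKey) throws Exception {
+        if (nodes.isEmpty()) throw new IllegalStateException("no NameNodes in the ring");
+        BigInteger z = id(requestKey);
+        Map.Entry<BigInteger, String> e = nodes.ceilingEntry(z);
+        return (e != null ? e : nodes.firstEntry()).getValue();          // wrap around past identifier 2^k - 1
+    }
+}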
+NCUC offers fast distributed computation of a hash function that assigns keys to NameNodes; it maps keys to NameNodes using consistent hashing [14, 15]. The NCUC hash function balances load efficiently by allocating approximately the same number of keys to every NameNode, and when an Nth NameNode joins or leaves the NCUC cluster, only an O(1/N) fraction of the keys has to be moved to a different NameNode.
+For an NCUC cluster with N NameNodes, each NameNode maintains routing information about only O(log N) other NameNodes, and a lookup requires O(log N) messages. This is achieved through the use of the Chord protocol [9].
+Figure 2: Workflow of the NCUC architecture (client resource requests are hashed into an RRQ, the RRQ is passed to the NCUC black box, the black box responds with an RRP, the client acknowledges it, and the client then contacts the Datanodes).
+[The Figure 3 ring diagram shows NameNodes N1, N8, N14, N21, N32, N38, N42, N48, N51, and N56 storing keys 10, 24, 34, 38, and 54.]
+It is arguable that integrating Chord into the HDFS NameNode architecture reduces the simplicity of the single-NameNode design, which is one of the primary goals of the HDFS architecture. However, the single-point-of-failure NameNode and the RAM limit on the metadata it can keep for all the files stored on the Datanodes call for an alternative solution, and integrating Chord into the HDFS NameNode provides a reliable and efficient one.
+VII. PERFORMANCE ANALYSIS
+First, we developed Chord and validated our implementation by reproducing all the experimental results reported in [9]. Second, we performed a set of experiments to evaluate our proposed HDFS architecture. For this evaluation we used two Amazon EC2 Linux nodes with Hadoop installed, and we configured and tested both the current HDFS architecture and our proposed HDFS architecture by performing two operations, READ and WRITE.
+Table I shows the results of writing and reading a single 512 MB file using the current HDFS architecture, while Table II shows the results of writing and reading a single 512 MB file using our proposed HDFS architecture.
+TABLE I. W/R RESULTS USING THE CURRENT HDFS ARCHITECTURE (512 MB, block size = 64 MB, nrFiles = 5, replication = 1)
+Op. | Performance metric      | Exp. #1 | Exp. #2 | Exp. #3 | Ave.
+W   | Throughput (MB/s)       | 129     | 134     | 139     | 134
+W   | Ave. I/O rate (MB/s)    | 129     | 134     | 139     | 134
+W   | I/O rate std. deviation | 0       | 0       | 0       | 0
+R   | Throughput (MB/s)       | 150     | 146     | 152     | 150
+R   | Ave. I/O rate (MB/s)    | 151     | 147     | 153     | 150
+R   | I/O rate std. deviation | 13      | 5       | 13      | 10
+TABLE II. W/R RESULTS USING THE PROPOSED HDFS ARCHITECTURE (512 MB, block size = 64 MB, nrFiles = 5, replication = 1)
+Op. | Performance metric      | Exp. #1 | Exp. #2 | Exp. #3 | Ave.
+W   | Throughput (MB/s)       | 156     | 151     | 154     | 154
+W   | Ave. I/O rate (MB/s)    | 152     | 149     | 150     | 150
+W   | I/O rate std. deviation | 0       | 0       | 0       | 0
+R   | Throughput (MB/s)       | 180     | 180     | 174     | 178
+R   | Ave. I/O rate (MB/s)    | 176     | 176     | 176     | 176
+R   | I/O rate std. deviation | 0       | 0       | 0       | 0
+These results show that our proposed architecture performed better in terms of both throughput and I/O rate.
+VIII. FUTURE WORK AND CONCLUSIONS
+Cloud computing enables companies to obtain IaaS, PaaS, and SaaS on demand to reduce costs.
One of the areas in which cloud computing is increasingly being used is large-scale data processing, and Apache Hadoop is one attempt to support such data-intensive distributed applications. Hadoop applications use a primary distributed file system for data storage called the Hadoop Distributed File System (HDFS). The metadata for HDFS Datanodes is limited by the RAM capacity of the single-point-of-failure NameNode. This paper proposed a fault-tolerant, highly available, and widely scalable HDFS architecture whose NameNode is distributed, so that HDFS does not fail when a single NameNode machine fails. We achieved this by utilizing the Chord protocol and integrating it with the HDFS NameNode. This approach introduces only a little additional complexity into the HDFS NameNode, while greatly improving the availability and scalability of the HDFS architecture and removing its single point of failure. As future work, we plan to conduct extensive experiments and to build a prototype based on that evaluation.
+ACKNOWLEDGMENT
+The author acknowledges the support provided by King Fahd University of Petroleum and Minerals (KFUPM). This project is funded by King Abdulaziz City for Science and Technology (KACST) under the National Science, Technology, and Innovation Plan (project number 11-INF1657-04).
+REFERENCES
+[1] M. Armbrust et al., "Above the Clouds: A Berkeley View of Cloud Computing," Technical Report EECS-2009-28, UC Berkeley, http://www.eecs.berkeley.edu/Pubs/TechRpts/2009/EECS-2009-28.html, Feb. 2009.
+[2] Amazon Web Services Economics Center, http://aws.amazon.com/economics/
+[3] http://en.wikipedia.org/wiki/Hadoop
+[4] http://hadoop.apache.org/#What+Is+Hadoop%3F
+[5] http://hadoop.apache.org/hdfs/
+[6] http://hadoop.apache.org/hdfs/docs/current/hdfs_user_guide.html
+[7] http://hadoop.apache.org/hdfs/docs/current/hdfs_design.html
+[8] http://en.wikipedia.org/wiki/MapReduce
+[9] I. Stoica, R. Morris, D. Liben-Nowell, D. Karger, M. F. Kaashoek, F. Dabek, and H. Balakrishnan, "Chord: a scalable peer-to-peer lookup protocol for internet applications," IEEE/ACM Transactions on Networking, vol. 11, no. 1, pp. 17-32, Feb. 2003.
+[10] D. R. Karger, E. Lehman, F. Leighton, M. Levine, D. Lewin, and R. Panigrahy, "Consistent hashing and random trees: Distributed caching protocols for relieving hot spots on the World Wide Web," in Proc. 29th Annu. ACM Symp. Theory of Computing, El Paso, TX, May 1997, pp. 654-663.
+[11] R. Cox, A. Muthitacharoen, and R. Morris, "Serving DNS using Chord," in Proc. 1st Int. Workshop on Peer-to-Peer Systems, Cambridge, MA, Mar. 2002.
+[12] F. Dabek, F. Kaashoek, D. R. Karger, R. Morris, and I. Stoica, "Wide-area cooperative storage with CFS," in Proc. ACM Symp. Operating Systems Principles, Banff, Canada, 2001, pp. 202-215.
+[13] D. Liben-Nowell, H. Balakrishnan, and D. R. Karger, "Analysis of the evolution of peer-to-peer systems," in Proc. 21st ACM Symp. Principles of Distributed Computing (PODC), Monterey, CA, July 2002, pp. 233-242.
+[14] D. R. Karger, E. Lehman, F. Leighton, M. Levine, D. Lewin, and R. Panigrahy, "Consistent hashing and random trees: Distributed caching protocols for relieving hot spots on the World Wide Web," in Proc. 29th Annu. ACM Symp. Theory of Computing, El Paso, TX, May 1997, pp. 654-663.
+[15] D. Lewin, "Consistent hashing and random trees: Algorithms for caching in distributed networks," Master's thesis, Dept. of Elect. Eng. and Comput. Sci., Massachusetts Institute of Technology, Cambridge, MA, 1998.
+[16] "Secure Hash Standard," U.S. Dept. of Commerce/NIST, National Technical Information Service, Springfield, VA, FIPS 180-1, Apr. 1995.
+[17] Zoltán Lajos Kis and Róbert Szabó, "Interconnected Chord-rings," Network Protocols and Algorithms, vol. 2, no. 2, 2010.
\ No newline at end of file
diff --git a/src/main/resources/cdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt.xml.xls
new file mode 100644
index 0000000000000000000000000000000000000000..72f6ee212878baefb8a042b43566089555c9e20b
Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop HDFS/Towards A Scalable HDFS Architecture.txt.xml.xls differ
diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt
new file mode 100644
index 0000000000000000000000000000000000000000..35baf04f2ad62a91e0807d6c16c81b2a986baf60
--- /dev/null
+++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS-ziyan.txt
@@ -0,0 +1,48 @@
+Input , Input
+Text Input Format , Text Input Format
+Sequence File Input Format , Sequence File Input Format
+Combine File Input Format , Combine File Input Format
+Key Value Text Input Format , Key Value Text Input Format
+Key Value Text Input Format , Key Filter
+Key Value Text Input Format , Key questions
+Fixed Length Input Format , Fixed Length Input Format
+NLine Input Format , NLine Input Format
+Combine File Record Reader , Combine File Record Reader
+Key Value Line Record Reader , Key Value Line Record Reader
+Key Value Line Record Reader , Key questions
+Sequence File Record Reader , Sequence File Record Reader
+DBRecord Reader , DBRecord Reader
+Map , Map
+Inverse Mapper , Inverse Mapper
+Multithreaded Mapper , Multithreaded Mapper
+Regex Mapper , Regex Mapper
+Token Counter Mapper , Token Counter Mapper
+Partition , Partition
+Binary Partitioner , Binary Partitioner
+Hash Partitioner , Hash Partitioner
+Hash Partitioner , default Partitioner
+Key Field Based Partitioner , Key Field Based Partitioner
+Key Field Based Partitioner , Key idea
+Rehash Partitioner , Rehash Partitioner
+Total Order Partitioner , Total Order Partitioner
+Reduce , Reduce
+Int Sum Reducer , Int Sum Reducer
+Int Sum Reducer , Reducer interfaces
+Int Sum Reducer , Reducer Factory
+Int Sum Reducer , Reducer aggregate
+Int Sum Reducer , Reducer Phase
+Int Sum Reducer , Reducer implementations
+Long Sum Reducer , Long Sum Reducer
+Long Sum Reducer , Reducer interfaces
+Long Sum Reducer , Reducer Factory
+Long Sum Reducer , Reducer aggregate
+Long Sum Reducer , Reducer Phase
+Long Sum Reducer , Reducer implementations
+Output , Output
+Map File Output Format , Map File Output Format
+Map File Output Format , method Map
+Map File Output Format , Map Reduce papers
+Map File Output Format , Map Task
+Map File Output Format , Facebook Map
+Sequence File Output Format , Sequence File Output Format
+Text Output Format , Text Output Format
\ No newline at end of file
diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e9490149a471f323efc9f15d4325c58cb4176a0d
--- /dev/null
+++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt
@@ -0,0 +1,898 @@
+A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS
+Article in Ciência e Técnica Vitivinícola, July 2016
+1 Heba, A., 2 Mohammed, E., 3 Shereif, B.
+1 Information Systems Dept., Faculty of Computers and Information, Mansoura University, Mansoura, Egypt, hebaaly92@gmail.com
+*2 Information Technology Dept., Faculty of Computers and Information, Mansoura University, Mansoura, Egypt, melmogy@mans.edu.eg
+3 Information Systems Dept., Faculty of Computers and Information, Mansoura University, Mansoura, Egypt, sherifiib@yahoo.com
+ABSTRACT
+Massive and varied data from the Internet of Things (IoT) create enormous storage challenges. IoT applications have developed extensively, and over the past two decades the expansion of computational assets has had a significant effect on the flow of data. This vast flow of data is known as "Big data": data that cannot be managed using current ordinary techniques or tools. If handled correctly, it yields valuable information, such as insight into user behavior, and supports business intelligence.
+In this paper, the proposed system is implemented to handle massive data from all forms of data resources, whether structured, semi-structured, or unstructured. The results and discussion show that the proposed system provides a feasible solution for big data IoT-based smart applications. In the data preprocessing stage, we use the K-nearest neighbors (KNN) technique to clean noisy data and Singular Value Decomposition (SVD) to reduce data dimensionality. In the processing stage, we propose a hybrid of Fuzzy C-means and density-based spatial clustering (FCM-DBSCAN) to deal with applications with noise. The clustering technique is implemented on the MapReduce model, the most widely accepted framework for processing big data. The proposed technique provides scalability, speed, and good accuracy for storing big data.
In addition, it extracts meaningful information from huge datasets, giving insight that supports effective outcomes on a fast and efficient processing platform. Experimental results show that the accuracy of the proposed framework is 98.9% on the IADL activities dataset.
+KEYWORDS: Internet of Things (IoT); Big data; Singular Value Decomposition (SVD); FCM-DBSCAN; MapReduce.
+1. INTRODUCTION
+The IoT is the connection that joins items to the Internet through a variety of information-sensing devices. All objects that can be addressed individually can therefore exchange information with each other and eventually realize the aims of recognition, location, tracking, supervision, and administration [1].
+Figure 1. The Big data in IoT.
+The essential idea of the IoT is to connect all things in the world to the Internet. It is expected that things can be recognized automatically, can communicate with each other, and can even make decisions without human interference [2]. Figure 1 shows the relationship between the IoT and big data and how sensor data are represented as big data.
+Data are among the most important parts of the IoT. In the IoT, data are gathered from various types of sensors and represent billions of objects. Overall, IoT data present the following challenges:
+- The massive scale of the IoT: the IoT includes a huge number of sensing devices that continuously and automatically gather data, which leads to rapid growth in the scale of the data.
+- Diversity of sensing devices: the devices observe varied resources, which makes IoT data heterogeneous. Data gathered from different devices and measures have different semantics and structures.
+- Interoperability: most IoT applications are currently isolated. In the long run, the IoT will need to support data sharing to encourage collaboration among diverse applications. Taking a telemedicine service as an example, when a patient is in an emergency, traffic data are also needed to estimate the arrival time of the ambulance and to decide what kind of auxiliary medical strategy to take.
+- Multi-dimensionality: this is a principal issue in IoT applications. An application usually incorporates several sensors to simultaneously monitor various indicators, such as temperature, humidity, light, and pressure, so the sample data are typically multidimensional [1].
+Data that are extensive in volume, highly varied, or moving at great speed are called "Big data." Big data is not a thing; it is a concept or paradigm that characterizes the growth, collection, and use of huge amounts of dissimilar data. Big data helps in decision making and takes business to a different level [3].
+Big data came into view when standard database systems were not prepared to handle unstructured data, such as web logs, videos, photographs, social updates, and records of human behavior, produced by social networks, sensor devices, and other data-generating sources.
+Figure 2. The Big data 4Vs and data sequence.
+Figure 2 shows the big data 4Vs, namely volume, velocity, variety, and veracity. It also describes the big data sequence.
Several issues and technologies are associated with the availability of extremely large volumes of data that organizations need to combine and access, and a significant investment of time, money, and resources is needed to make this style of processing ordinary.
+The rest of this paper is structured as follows. Section 2 presents some basic concepts. Section 3 reviews related work. Section 4 presents the proposed system and explains each phase in detail. In Section 5, the implementation results of the proposed techniques are discussed on a benchmark dataset. Finally, the conclusion and future work are presented in Section 6.
+2. BASIC CONCEPTS
+2.1 MAPREDUCE
+The big data analytics community has adopted MapReduce as a programming template for handling massive data on distributed systems. The MapReduce model has become one of the preferred programming paradigms for processing massive datasets; it is a paradigm for developing a distributed solution to complicated problems over enormous datasets [4]. Users specify a map function that processes a key-value pair to produce a set of intermediate key-value pairs, and a reduce function that merges all intermediate values associated with the same intermediate key. The MapReduce architecture is shown in Figure 3.
+Figure 3. The MapReduce architecture.
+MapReduce Algorithm
+There are four steps in applying the MapReduce framework: reading a large dataset, implementing the map function, implementing the reduce function, and returning the data resulting from the map and reduce. The mapper receives masses of data and produces intermediate results; the reducer reads the intermediate results and emits a final result.
+A. Read Large Dataset
+Figure 4. The block diagram of MapReduce read data.
+As shown in Figure 4, we create a datastore from the dataset with a CSV extension. The datastore presents a tabular datastore object for the data. We then select specific variables' names from the dataset; the selected variables' names feature permits working with only the variables the user needs. The user can use the preview command to retrieve the data.
+B. Generic Map Function
+Figure 5 shows the generic map function, which is a general function for any key and value. It enables the coder to set any key-value pair for the selected dataset. We set an intermediate key and an intermediate value, then subset the dataset at this specific value, and finally obtain a set of key-value pairs stored in the key-value store.
+[Figure 4 block diagram: insert the large dataset, initialize a datastore variable to store it, select specific variables' names from the dataset, add the selected variables to the datastore, and preview the dataset.]
+Figure 5. The block diagram of the generic map function.
+C. Map Function
+[Figure 5 block diagram: the generic map function takes the data, a subset term, and a key-value store; it initializes an intermediate key and value, subsets the data at the specific value, and adds the intermediate key-value pairs to an intermediate key-value store. Figure 6 block diagram: the map function receives the data and a specific value, sets the key-value condition, creates an output store for all partitions of the data that satisfy the condition, and stores the results in the output key-value store.]
+Figure 6. The block diagram of the map function.
+Figure 6 illustrates the map function, which gets a table with the variables labeled by the selected variables' names property in the datastore. The map function then extracts the subset of the dataset that satisfies the condition value of the selected key.
+D. Reduce Function
+Figure 7 shows the reduce function, which receives the subsetted results produced by the map function and simply merges them into a single table. The reduce returns one key and one value.
+Figure 7. The block diagram of the reduce function.
+[Figure 7 block diagram: the reduce function takes the intermediate values from the key-value store, initializes an output value variable, iterates over all intermediate results, adds the intermediate values to the output value, and adds all output values to the output key-value store.]
+2.2 DBSCAN ALGORITHM
+DBSCAN [5] is a clustering technique that depends on density. The idea is that if a point belongs to a cluster, it should be close to many other points in that cluster. The DBSCAN algorithm works as follows. First, two parameters are picked: a positive number epsilon and a natural number minPoints. We then start by picking an arbitrary point in the dataset. If there are more than minPoints points within a distance of epsilon from that point, we consider all of them to be part of a "cluster." We then extend the cluster by checking all of the new points and seeing whether they, too, have more than minPoints points within a distance of epsilon, in which case further points are added to the cluster. After that, we pick a new arbitrary point and repeat the process. It is entirely possible that the picked point has fewer than minPoints points in its epsilon neighborhood and does not belong to any other cluster; it is then considered a "noise point" that does not fit in any cluster. The DBSCAN pseudocode is as follows:
+1. Build a graph whose nodes are the points to be clustered.
+2. For each core point c, make an edge from c to every point p in the epsilon-neighborhood of c.
+3. Set N to the nodes of the graph.
+4. If N does not contain any core points, terminate.
+5. Pick a core point c in N.
+6. Let X be the set of nodes that can be reached from c by going forward:
+   a. create a cluster containing X ∪ {c};
+   b. N = N \ (X ∪ {c}).
+7. Continue with step 4.
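+As a companion to the pseudocode above, the compact sketch below implements plain DBSCAN over points in the plane. It is an editor's illustration rather than the paper's code; the parameter names mirror the text (eps, minPoints), and the 2-D Euclidean distance is an assumption of the sketch.
+// Compact DBSCAN sketch (illustrative, not the paper's implementation).
+import java.util.ArrayList;
+import java.util.List;
+
+public class SimpleDbscan {
+    public static final int UNVISITED = 0, NOISE = -1;
+
+    /** Returns a label per point: 1..k for clusters, -1 for noise. */
+    public static int[] cluster(double[][] pts, double eps, int minPoints) {
+        int[] label = new int[pts.length];                         // all points start UNVISITED (0)
+        int clusterId = 0;
+        for (int i = 0; i < pts.length; i++) {
+            if (label[i] != UNVISITED) continue;
+            List<Integer> seeds = regionQuery(pts, i, eps);
+            if (seeds.size() < minPoints) { label[i] = NOISE; continue; }
+            clusterId++;
+            label[i] = clusterId;
+            for (int k = 0; k < seeds.size(); k++) {               // expand the cluster
+                int j = seeds.get(k);
+                if (label[j] == NOISE) label[j] = clusterId;       // border point joins the cluster
+                if (label[j] != UNVISITED) continue;
+                label[j] = clusterId;
+                List<Integer> jn = regionQuery(pts, j, eps);
+                if (jn.size() >= minPoints) seeds.addAll(jn);      // j is a core point: keep expanding
+            }
+        }
+        return label;
+    }
+
+    private static List<Integer> regionQuery(double[][] pts, int i, double eps) {
+        List<Integer> out = new ArrayList<>();
+        for (int j = 0; j < pts.length; j++) {
+            double dx = pts[i][0] - pts[j][0], dy = pts[i][1] - pts[j][1];
+            if (Math.sqrt(dx * dx + dy * dy) <= eps) out.add(j);
+        }
+        return out;
+    }
+}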
+2.3 FCM ALGORITHM
+FCM [6, 7] is a data clustering procedure in which the dataset is partitioned into n clusters. A data point close to the center of a cluster has a high degree of membership in that cluster, while a data point that lies far from the center has a low degree of membership. The technique is often used in pattern recognition and is based on minimizing an objective function. The algorithmic steps of Fuzzy C-means clustering are as follows.
+First, the cluster centers are calculated using the following equation [6]:
+c_j = ( Σ_{i=1..N} M_ij^m · x_i ) / ( Σ_{i=1..N} M_ij^m )   (1)
+Then, the objective function is calculated from the membership matrix:
+J_m = Σ_{i=1..N} Σ_{j=1..C} M_ij^m · ||x_i − c_j||^2   (2)
+Finally, the membership values are updated:
+M_ij = 1 / Σ_{k=1..C} ( ||x_i − c_j|| / ||x_i − c_k|| )^(2/(m−1))   (3)
+where m is a real number greater than 1, M_ij is the degree of membership of x_i in cluster j, x_i is the i-th d-dimensional measured data point, c_j is the d-dimensional center of cluster j, and ||·|| is the similarity measure between a measured data point and a center.
+FCM iteratively moves the cluster centers to the right locations within the dataset. FCM clustering strategies rely on fuzzy behavior and provide a natural way to produce a clustering in which the membership weights have a natural interpretation but are not probabilistic.
+2.4 K-NEAREST NEIGHBORS (KNN)
+In KNN regression, the outcome is an estimate for the item, computed as the average of the values of its k nearest neighbors, so the performance test should be chosen appropriately. KNN computes the Euclidean distance from the query example to the labeled examples using the following equation [8]:
+D = sqrt( Σ_{i=1..d} (x_i − y_i)^2 )   (4)
+Selecting the ideal value of K is best done by first reviewing the data; a larger K value is generally more accurate because it reduces the overall noise. The labeled examples are then ordered by distance, and the top K nearest neighbors are found heuristically. Finally, the data are searched for the most likely instance. KNN does not lose any detail, since it compares every training sample in order to give the prediction.
+2.5 SINGULAR VALUE DECOMPOSITION
+SVD takes a rectangular data matrix A, an n x p matrix in which the n rows represent the data samples and the p columns represent the measured properties. The SVD theorem states that [9]:
+A_(n×p) = U_(n×n) S_(n×p) V^T_(p×p)   (5)
+where
+U^T U = I_(n×n)   (6)
+V^T V = I_(p×p)  (i.e., U and V are orthogonal)   (7)
+The columns of U are the left singular vectors, S has the same dimensions as A and contains the singular values, and the rows of V^T are the right singular vectors. The SVD represents the original data in a coordinate system in which the matrix is diagonal. The SVD can be computed from
+W = A A^T   (8)
+W x = λ x   (9)
+where the scalar λ is an eigenvalue of W and x is an eigenvector corresponding to λ. The computation of the SVD consists of finding the eigenvalues and eigenvectors of A A^T or A^T A: the eigenvectors of A^T A form the columns of V, the eigenvectors of A A^T form the columns of U, and the singular values in S are the square roots of the eigenvalues of A A^T or A^T A. The singular values are the diagonal entries of S, arranged in descending order, and are always real numbers; if A is a real matrix, then U and V are also real. The SVD also yields the closest rank-l approximation of a matrix: by setting the small singular values to zero, we obtain a matrix approximation whose rank equals the number of remaining singular values.
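+As a concrete illustration of equations (5)-(7) and of the rank-l approximation described above, the sketch below uses Apache Commons Math; the library dependency and the tiny example matrix are assumptions of this editor's sketch, not part of the paper.
+// Illustrative SVD / rank-l approximation (assumes Apache Commons Math 3 on the classpath).
+import org.apache.commons.math3.linear.MatrixUtils;
+import org.apache.commons.math3.linear.RealMatrix;
+import org.apache.commons.math3.linear.SingularValueDecomposition;
+
+public class SvdReductionExample {
+    public static void main(String[] args) {
+        // A small n x p data matrix (rows = samples, columns = measured properties).
+        RealMatrix a = MatrixUtils.createRealMatrix(new double[][] {
+                {2.0, 0.0, 1.0}, {0.0, 3.0, 1.0}, {1.0, 1.0, 4.0}, {2.0, 1.0, 0.0}});
+        SingularValueDecomposition svd = new SingularValueDecomposition(a);  // A = U S V^T
+        double[] sigma = svd.getSingularValues();                            // descending order
+
+        int l = 2;                                                           // keep the l largest singular values
+        RealMatrix s = svd.getS().copy();
+        for (int i = l; i < sigma.length; i++) s.setEntry(i, i, 0.0);        // zero out the small ones
+        RealMatrix rankL = svd.getU().multiply(s).multiply(svd.getVT());     // closest rank-l approximation
+        System.out.println("Rank-" + l + " approximation: " + rankL);
+    }
+}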
+3. RELATED WORK
+Big data from the IoT is an important research topic, and many researchers are working in this field. For example, Tao and Ji [10] used the MapReduce technique to analyze numerous small datasets. They proposed a procedure for massive small data based on the K-means clustering algorithm, and their results showed that the suggested approach can improve data processing efficiency. They used the K-means algorithm for data analysis on top of MapReduce and then used an index for merging data within a cluster, so that data in the same block have high similarity once the merge is finished. This research helps them plan a merging strategy for small data in the IoT. The DBSCAN algorithm is well suited to big data because the number of clusters does not need to be known in advance.
+Xu and Xun [11] outlined a MapReduce model of distributed computing. Within the MapReduce mechanism, they combined the architectural characteristics and key technologies of the IoT, performed distributed mining of IoT data and information, and described stream data distribution. As a conventional way of mining valuable information from raw IoT data, they analyzed the deficiencies of the classical Apriori algorithm: Apriori has low mining efficiency and consumes a large amount of memory. Mining massive IoT data involves stream data analysis, clustering, and so on. They plan to propose a system for handling big data at low cost while protecting data security; the proposed system has low efficiency, so it should be improved.
+Wang et al. [12] investigated an IoT architecture for agriculture that provides distributed processing, together with its implementation. The implementation is designed as a two-tier structure using HBase. The architecture provides real-time read access to the enormous amount of sensor data, and support for the sensor data is implemented with the MapReduce model. XML documents set standards for the framework to bind the formats of heterogeneous sensor data. Using this framework, however, leaves the system short of variety in the sensor data it can handle.
+Gole and Tidk [13] proposed the ClustBigFIM method, which is based on the MapReduce framework for mining large datasets. ClustBigFIM is an improvement of the BigFIM algorithm that speeds up extracting information from massive datasets. It relies on associations, sequential patterns, correlations, and other data mining tasks that give good insight. The MapReduce platform is widely used for mining big data from social media with conventional tools and systems. The aim is to apply frequent itemset mining and the MapReduce framework to a stream of data, which can yield consistent insights into big data.
+Li et al. [1] suggested a storage management solution based on NoSQL, called IOTMDB, to handle massive and heterogeneous IoT data. IOTMDB is concerned not only with how to store massive IoT data effectively but also with data sharing. The IoT data storage strategies incorporate a preprocessing procedure to cover both common and specific requirements. Their future work is a model oriented to IOTMDB relying on NoSQL; in addition, they will process and analyze the massive IoT data to expand its value. Applying a reduction algorithm in the preprocessing step would improve the accuracy and save time.
+Mesiti and Valtolina [14] proposed a framework able to assemble data from different sources with diverse structures, such as JSON, XML, text, and data streaming from sensors. Such data collections leave the database unstructured and require data integration. As the world moves to develop big data analysis strategies, they arrived at a solution that loads data from heterogeneous sensors and then integrates the heterogeneous sensor data using NoSQL systems. They designed an easy-to-use loading framework by defining a procedure for choosing the fitting NoSQL system that allows a suitable mapping to be deployed.
+Zhan et al. [15] designed a massive data processing model in the cloud. Their model can handle all types of data resources, whether structured, semi-structured, or unstructured. They concentrated on two main points: first, they outlined CloudFS, which builds on the open-source Hadoop project; second, they implemented Cloud Manager DB, which builds on the open-source HBase and MongoDB projects. However, they did not provide any method for dealing with the variety of the data.
+Galache et al. [16] presented the ClouT project, a joint European-Japanese venture. Their main concern is making cities aware of their resources and taking care of these resources through a set of smart IoT services in the cloud. The proposed framework is based on a three-layer architecture composed of CIaaS, CPaaS, and CSaaS layers. They developed four different use cases associated with different applications in four cities; these assets are used and managed by efficient IoT services in the cloud.
+Sowe et al. [17] proposed an answer to the problem of massive heterogeneous sensor data, which requires joining different types of data in an integrated IoT architecture. It incorporates Service-Controlled Networking (SCN) as a key middleware to manage heterogeneous data gathered from sensors on a big data cloud platform. The proposed model is applied to collect and share data and to manage IoT communities, and it allows the client to explore, find, and use the sensor data. They used User Defined Harvester (UDH) technologies in addition to SCN to extend the sensing that is covered. In that work, mobile sensing data are not available, and a framework that can deal with such sensing data still has to be implemented.
+Cecchinel et al. [18] proposed a software architecture able to support big data analysis based on datasets that come from physical sensors in the SMARTCAMPUS project. Their architecture can address real requirements from the SMARTCAMPUS project. Consequently, the work in this architecture concerns data collection and storage, i.e., the critical path of a big data collection platform, using a middleware architecture. They plan to create a programming model that enables every user to build their own applications. If new data are added, the platform software can go down, so the adaptability of the system should be improved.
+Mishra et al. [19] proposed a Cognitive-Oriented IoT Big-data framework (COIB) for effective data management and knowledge discovery over IoT big data. They built a general IoT big data layered architecture through the use of the COIB framework in a large-scale industrial automation environment. As future work, they suggest incorporating the mining and analysis of the huge data produced by trillions of IoT objects.
+In this paper, our proposed system offers a solution for storing and retrieving IoT big data and improves the accuracy of the resulting data. The proposed system can store and retrieve a massive amount of data in a short time. First, we clean noise from the data. Then, we use Kennard sampling and SVD as data reduction techniques to reduce IoT big data without losing information. We also use the mutual information algorithm to detect relationships between attributes and predict semantic clusters. Finally, we use MapReduce based on FCM-DBSCAN to cluster the data for fast storage and retrieval.
+4. THE PROPOSED SYSTEM
+Figure 8. The proposed system for massive, heterogeneous sensor data.
+[Figure 8 pipeline: raw data from a variety of sensors → data cleaning (cleaned, noiseless data) → data reduction (dimensionally reduced data of small size) → data integration (homogeneous data) → data processing (clustering) → storage.]
+The proposed system consists of two main phases, data preprocessing and data processing, as shown in Figure 8. In the preprocessing phase, the first stage is data collection, in which the dataset is collected from different sensors. The second stage is data cleaning based on outlier detection and noise removal, as it is easy to implement. The third stage is data reduction: the SVD algorithm reduces the dimensionality of the data and the execution time of data processing, and Kennard sampling then selects a sample from the dataset to save running time. The last stage is data integration based on correlation and mutual information, which aims to determine the relationships between attributes and detect semantic clusters. In the processing phase, the data are clustered using FCM-DBSCAN on MapReduce, a standard programming model for distributed data, to improve big data performance in little time. The main stages of these two phases are discussed in detail in the following subsections.
+a. Data Preprocessing Phase:
+Preprocessing is a basic phase in data science because it ensures that decisions are derived from qualified data. Data preprocessing is a data mining step that transforms raw data into usable information. Real data are frequently incomplete, inconsistent, lacking certain behaviors, and likely to contain numerous mistakes. Data preprocessing is a proven strategy for resolving such issues and is used in database-driven applications [20]. The applied preprocessing steps are data cleaning, data reduction, and data integration, discussed in detail in the following subsections.
+a) Data Cleaning:
+Cleaning the data is not an easy procedure: more than 30% of real data may be dirty, and cleaning it is very costly [21].
Data can be cleaned by procedures such as filling in missing values, smoothing noisy data, or resolving inconsistencies in the data. Several ways have been used to deal with missing data, such as [22]:
+- Deletion: removes the missing data and uses the rest of the data in the analysis. Deletion can be inefficient, as it decreases the dataset size and may delete valuable data.
+- Imputation: tries to fill in the missing values with the help of techniques such as:
+o Mean/mode: fills the missing data using the mean of a numeric attribute or the mode of a nominal attribute over all the data.
+o K-nearest neighbor (KNN) imputation: uses the KNN algorithm to fill in the missing data. It can deal with discrete and continuous attributes. KNN searches all the data to find the most similar instances and can choose the most probable value from the dataset.
+We adopt the KNN algorithm for data cleaning.
+Figure 9. The block diagram of the data cleaning steps.
+Figure 9 shows that the input data present many challenges, such as noisy data and outliers. First, the data are de-duplicated to remove repetition. Then, outliers are detected and excluded from the data. The data are filtered to choose the principal attributes that represent the data. Finally, missing values are replaced by the most probable value using KNN regression.
+b) Data Reduction
+A massive amount of data is increasingly available from different sources, for example logistics records, cameras, microphones, RFIDs, barcode data, wireless sensor networks, and R&D activity logs [23]. High-dimensional data pose great challenges in terms of computational complexity and classification performance. It is therefore important to derive a low-dimensional feature space from the high-dimensional feature space in order to design a learner with good performance [24].
+Figure 10. The block diagram of the data reduction steps.
+[Figure 9 block diagram: input data → de-duplication → outlier detection → filtering → replacement of missing values. Figure 10 block diagram: cleaned, noiseless input data → numerosity reduction (sampling) and dimensionality reduction (singular value decomposition) → reduced data.]
+Figure 10 shows that the cleaned data are the input to the data reduction stage. Data reduction is separated into numerosity reduction and dimensionality reduction. Numerosity reduction can be performed by regression or sampling; the sampling algorithm used here is the Kennard sample, which reduces the number of iterations by maintaining a list of the largest smallest distances, aiming to save time. Dimensionality reduction can be performed with many algorithms, such as PCA, SOM, and SVD; we propose SVD, which is well suited to reducing the dimensionality of large data. We compared the SVD algorithm with other algorithms such as PCA, SOM, ICA, and kernel PCA and conclude that SVD runs in less time than the other algorithms.
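+The Kennard sampling step mentioned above can be illustrated with a small sketch of the classic Kennard-Stone selection rule: start from the two most distant points, then repeatedly add the point whose smallest distance to the already selected set is largest. This is an editor's illustration under that assumption, not code from the paper.
+// Illustrative Kennard-Stone style sample selection (not the paper's implementation).
+import java.util.ArrayList;
+import java.util.List;
+
+public class KennardSampler {
+    /** Selects k row indices from data (rows = samples) by maximizing the minimum distance to the chosen set. */
+    public static List<Integer> select(double[][] data, int k) {
+        List<Integer> chosen = new ArrayList<>();
+        int a = 0, b = 1;
+        double best = -1.0;
+        for (int i = 0; i < data.length; i++)          // seed with the two most distant points
+            for (int j = i + 1; j < data.length; j++)
+                if (dist(data[i], data[j]) > best) { best = dist(data[i], data[j]); a = i; b = j; }
+        chosen.add(a);
+        chosen.add(b);
+        while (chosen.size() < k) {                    // greedily add the point with the largest smallest distance
+            int next = -1;
+            double bestMin = -1.0;
+            for (int i = 0; i < data.length; i++) {
+                if (chosen.contains(i)) continue;
+                double minD = Double.MAX_VALUE;
+                for (int c : chosen) minD = Math.min(minD, dist(data[i], data[c]));
+                if (minD > bestMin) { bestMin = minD; next = i; }
+            }
+            chosen.add(next);
+        }
+        return chosen;
+    }
+
+    private static double dist(double[] x, double[] y) {
+        double s = 0.0;
+        for (int i = 0; i < x.length; i++) s += (x[i] - y[i]) * (x[i] - y[i]);
+        return Math.sqrt(s);
+    }
+}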
b. Data Processing Phase
Data processing refers to the handling of the information that is needed to run organizations [28]. Massive IoT data requires processing before it can be stored: IoT data is sampled at a high frequency, which results in a tremendous amount of repeated or extremely similar values. We suggest MapReduce based on a hybrid of FCM and DBSCAN as the clustering algorithm to overcome the massive data storage problem, since MapReduce is considered the most suitable technique for massive data processing.
In the FCM-DBSCAN Map function, we first initialize the minimum number of points per cluster (MinPts), the epsilon value (eps) that bounds the distance between a centre and a point, and the membership matrix M. Then we calculate the cluster centres using the FCM centre-update equation. For each point in the dataset, the distance between the points and the cluster centres is calculated using the weighted distance
d = Σ_{i=1}^{N} Σ_{j=1}^{C} M_ij^m ||x_i - c_j||^2.
If the distance between a point and a cluster centre is less than or equal to the epsilon value, the point is marked as a neighbour point (neighborPt) of that cluster; the neighbour points of each centre are thus determined by the epsilon value. If the neighbour points of a cluster are fewer than the minimum points, the point is marked as noise; otherwise it is marked as clustered, and we determine the key and create a new cluster. This repeats until the convergence state is reached. Finally, each point is emitted together with the cluster it belongs to.
FCM-DBSCAN Map function
FCM-DBSCAN-Map(D, eps, MinPts, M)
  Initialize the number of clusters.
  For each point P in dataset D
    if P is visited
      continue to the next point
    mark P as visited
    Calculate the cluster centres using the FCM centre-update equation
    Calculate the distance d = Σ_{i=1}^{N} Σ_{j=1}^{C} M_ij^m ||x_i - c_j||^2
    For each p in D
      Calculate neighborPts for each centre c based on eps
        (if d <= eps, mark p as a neighborPt)
      if sizeof(neighborPts) < MinPts
        mark P as NOISE
      else
        Prepare the key and create a new cluster C
        C = next cluster
        expandCluster(P, C)
        C.neighborPoints = neighborPts
      For each centre c
        Update the membership M_ij = 1 / Σ_{k=1}^{C} ( ||x_i - c_j|| / ||x_i - c_k|| )^(2/(m-1))
        Recalculate the cluster centres using the FCM centre-update equation
        if the new centres equal the old centres then break
      End for
      Emit(key, c)
    End for
  End for
End function
FCM-DBSCAN Reduce function
FCM-DBSCAN-Reduce(key, C, eps, MinPts)
  For all clusters C do
    Set finalC.points = finalC.points ∪ C.points
    For all P' in C.neighborPoints do
      if P' is not visited
        mark P' as visited
        Calculate neighborPts' for P' based on eps
        if sizeof(neighborPts') >= MinPts
          set neighborPts = neighborPts ∪ neighborPts'
      if P' is not yet a member of any cluster
        add P' to cluster C
    End for
  End for
  Output: the set of clusters of the data.
In the FCM-DBSCAN Reduce function, the inputs are the minimum points, the epsilon value, the clusters, and the keys. For each cluster C, the final cluster points equal the previous cluster points plus the current cluster points. For every point in a cluster that is marked as unvisited, the point is marked as visited, its neighbour points are calculated and compared with the minimum points, and if the neighbour points are greater than or equal to the minimum points they are merged into the cluster's neighbour set. Any such point that does not yet belong to a cluster is added to cluster C. Finally, the output is the set of clusters of the data.
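The pseudocode above relies on three standard fuzzy C-means quantities: the membership update M_ij, the centre update c_j, and the weighted distance d. The following self-contained Java sketch shows these formulas only; the centre update uses the standard FCM formula, since the paper's own centre equation is not reproduced in the extracted text, and the class and method names are placeholders rather than the authors' MapReduce implementation:

// Standard fuzzy C-means update steps referenced by the Map function above.
// Points and centres are dense double[] vectors of equal length; m > 1 is the fuzzifier.
public class FcmStep {

    static double sqDist(double[] a, double[] b) {
        double s = 0;
        for (int k = 0; k < a.length; k++) { double d = a[k] - b[k]; s += d * d; }
        return s;
    }

    // Membership update: M_ij = 1 / sum_k ( ||x_i - c_j|| / ||x_i - c_k|| )^(2/(m-1))
    // (a production version would guard against a point coinciding with a centre)
    static double[][] updateMembership(double[][] x, double[][] c, double m) {
        double[][] M = new double[x.length][c.length];
        for (int i = 0; i < x.length; i++) {
            for (int j = 0; j < c.length; j++) {
                double dij = Math.sqrt(sqDist(x[i], c[j]));
                double sum = 0;
                for (int k = 0; k < c.length; k++) {
                    sum += Math.pow(dij / Math.sqrt(sqDist(x[i], c[k])), 2.0 / (m - 1));
                }
                M[i][j] = 1.0 / sum;
            }
        }
        return M;
    }

    // Centre update (standard FCM): c_j = sum_i M_ij^m * x_i / sum_i M_ij^m
    static double[][] updateCentres(double[][] x, double[][] M, double m) {
        int dims = x[0].length, clusters = M[0].length;
        double[][] c = new double[clusters][dims];
        for (int j = 0; j < clusters; j++) {
            double denom = 0;
            for (int i = 0; i < x.length; i++) {
                double w = Math.pow(M[i][j], m);
                denom += w;
                for (int k = 0; k < dims; k++) c[j][k] += w * x[i][k];
            }
            for (int k = 0; k < dims; k++) c[j][k] /= denom;
        }
        return c;
    }

    // Weighted distance d = sum_i sum_j M_ij^m * ||x_i - c_j||^2
    static double weightedDistance(double[][] x, double[][] c, double[][] M, double m) {
        double d = 0;
        for (int i = 0; i < x.length; i++)
            for (int j = 0; j < c.length; j++)
                d += Math.pow(M[i][j], m) * sqDist(x[i], c[j]);
        return d;
    }
}

In a Hadoop job these updates would run inside the Map function sketched above, with the Reduce function merging the per-split clusters.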
As shown in figure 8, raw data is collected from different sensors, which results in many problems, such as noisy, heterogeneous, and massive data. Our proposed work aims to solve these problems facing sensor data. The raw data is collected from the different sensors and stored, and preprocessing is then applied to it. The data is first cleaned from noise by KNN regression; we suggest KNN for dealing with noisy data because it is very simple and detects the most probable value better than other techniques. The cleaned data is then reduced using the SVD algorithm, which is very suitable for reducing high-dimensional data and for revealing significant insights into it. The data is then sampled using the Kennard sample, which speeds up the running time. Finally, we integrate the data coming from heterogeneous sources based on correlation and covariance matrices, using the mutual information matrix to detect the relationships between elements in the dataset and to predict the semantic clusters.
In the data processing step, the proposed model is the MapReduce model based on the FCM-DBSCAN clustering technique. FCM-DBSCAN is a density-based clustering algorithm that operates on an arrangement of entities in some space. It can discover clusters of diverse shapes and sizes in a huge quantity of data without the clusters having to be specified in advance.
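The "Kennard sample" used for numerosity reduction appears to refer to Kennard-Stone sampling, which repeatedly selects the candidate whose smallest distance to the already selected points is largest (the "highest smallest distances" mentioned in the data reduction subsection). A minimal Java sketch under that assumption, with placeholder names:

import java.util.ArrayList;
import java.util.List;

public class KennardStoneSampler {

    // Select k representative row indices from the data matrix (assumes 2 <= k <= data.length).
    public static List<Integer> select(double[][] data, int k) {
        int n = data.length;
        List<Integer> selected = new ArrayList<>();

        // Start from the pair of points that are farthest apart.
        int a = 0, b = 1;
        double best = -1;
        for (int i = 0; i < n; i++)
            for (int j = i + 1; j < n; j++) {
                double d = sqDist(data[i], data[j]);
                if (d > best) { best = d; a = i; b = j; }
            }
        selected.add(a);
        selected.add(b);

        // Each candidate's distance to its nearest selected point.
        double[] minDist = new double[n];
        for (int i = 0; i < n; i++)
            minDist[i] = Math.min(sqDist(data[i], data[a]), sqDist(data[i], data[b]));

        while (selected.size() < k) {
            int next = -1;
            double bestMin = -1;
            // Pick the candidate with the largest smallest distance to the selected set.
            for (int i = 0; i < n; i++) {
                if (selected.contains(i)) continue;
                if (minDist[i] > bestMin) { bestMin = minDist[i]; next = i; }
            }
            selected.add(next);
            for (int i = 0; i < n; i++)
                minDist[i] = Math.min(minDist[i], sqDist(data[i], data[next]));
        }
        return selected;
    }

    private static double sqDist(double[] p, double[] q) {
        double s = 0;
        for (int i = 0; i < p.length; i++) { double d = p[i] - q[i]; s += d * d; }
        return s;
    }
}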
5. THE EXPERIMENTAL RESULTS AND DISCUSSION
5.1 DATASET DESCRIPTION
The dataset includes ordinary IADL housekeeping activities [29]: vacuuming, ironing, dusting, brooming, mopping, cleaning windows, making the bed, watering plants, washing dishes, and setting the table. The overall duration of the dataset is 240 minutes. The durations differ among some of the activities, reflecting the usual distribution of activities in daily life. The Porcupine sensor was used together with the iBracelet to record both acceleration and RFID tag detections. The dataset consists of an estimated 1,048,576 records. We implemented the proposed technique on the dataset using Radoop, KNIME, and Matlab 2015b on a Core(TM) 2 Duo 2 GHz processor with 3 GB RAM.
5.2 RESULTS VIEW
Figure 11. A part of the used dataset.
Figure 11 shows a part of the used dataset. Act represents the activity label (ironing, vacuuming, brooming, making the bed, mopping, window cleaning, watering plants, dish washing, or setting the table). Acc represents the 3D accelerometer readings [x, y, z], stored in Acc1, Acc2, and Acc3; Lgt represents light; Tlt represents the nine tilt values; Btn represents the annotation buttons; Rtc represents the real-time clock [ddmmyyhhmmss]; and Time represents the number of seconds elapsed since the beginning of the recording.
Figure 12. The outlier detection.
Figure 12 shows the outlier detection. A new field called outlier appears: when an outlier is found its value is true, otherwise it is false. The outlier flag is true when an observation lies well outside the expected range of values in an experiment; such an outlier arises from variability in the measurement or indicates experimental error. The outliers are excluded from the dataset.
Figure 13. Excluding the outliers and replacing the missing values.
Figure 13 shows that the outlier property is false for all the tuples and that the missing values have been replaced by the most probable value using KNN regression.
Figure 14. The SVD deployment.
Figure 14 shows the application of the SVD algorithm, which results in the reduction of the dataset. The data is represented using a smaller number of attributes; an attribute with a high singular value has priority in representing the data, and SVD1 has the highest capacity to represent it.
Figure 15. The mutual information matrix.
Figure 15 shows the matrix produced by mutual information. The mutual information (trans-information) of two variables is a measure of their mutual dependence, and it represents the degree of association or correlation between the row and column variables; here it is divided by 2N, where N is the sample size. The mutual information between items is used as a feature for clustering to discover semantic clusters; a large mutual information value indicates a strong relationship between attributes.
5.3 RESULT VIEW OF MAPREDUCE PROCESSING
Figure 16. The resulting attributes from the Read dataset code.
Figure 17. The MapReduce function execution and the resulting data read after the MapReduce implementation.
Figure 16 shows the data read by the Read dataset code, i.e., the set of attributes of the IADL dataset after the data preprocessing phase. Figure 17 shows the MapReduce execution: the job starts with no Map and no Reduce progress (Map 0%, Reduce 0%) and runs until Map reaches 100% and Reduce reaches 100%, after which we read the data resulting from the MapReduce implementation.
5.4 EVALUATION
The evaluation considers the time and accuracy of processing the dataset. As shown in Tables 2 and 3, the precision is 99.3%, the sensitivity is 99.53%, and the specificity is 85.52%. From these results we conclude that the reduction step and FCM-DBSCAN raise the accuracy on the Big data to 98.9%.
Accuracy = (TP + TN) / (TP + TN + FP + FN)   (11)
Precision = TP / (TP + FP)   (12)
Sensitivity (TP rate) = TP / (TP + FN)   (13)
Specificity (TN rate) = TN / (TN + FP)   (14)
where:
TP (True Positives): positive tuples correctly labeled
FP (False Positives): negative tuples incorrectly labeled
TN (True Negatives): negative tuples correctly labeled
FN (False Negatives): positive tuples incorrectly labeled
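A small Java sketch of equations (11)-(14), using the counts from Table 2 below (TP = 9500, FN = 44, FP = 66, TN = 390); with these counts the formulas reproduce the reported precision, sensitivity, specificity, and accuracy values. The class name is a placeholder:

public class EvaluationMetrics {

    public static void main(String[] args) {
        double tp = 9500, fn = 44, fp = 66, tn = 390;          // counts from Table 2

        double accuracy    = (tp + tn) / (tp + tn + fp + fn);  // eq. (11), about 0.989
        double precision   = tp / (tp + fp);                   // eq. (12), about 0.993
        double sensitivity = tp / (tp + fn);                   // eq. (13), about 0.9954
        double specificity = tn / (tn + fp);                   // eq. (14), about 0.8553

        System.out.printf("accuracy=%.4f precision=%.4f sensitivity=%.4f specificity=%.4f%n",
                accuracy, precision, sensitivity, specificity);
    }
}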
Table 1: The comparison between K-Means, Optics, EM, DBSCAN, and our proposed FCM-DBSCAN, based on the MapReduce model.
                 PCA               PCA (Kernel)      ICA               SOM               SVD
                 Acc.(%)  Time(s)  Acc.(%)  Time(s)  Acc.(%)  Time(s)  Acc.(%)  Time(s)  Acc.(%)  Time(s)
K-Means          92       0.2      0.89     2        87       0.2      92.15    0.1      94.73    0.2
Optics           90.2     1.9      91       9.68     65.48    0.6      91.05    1.9      90       0.79
EM               65.82    13       75.28    2.17     94.4     2.54     66.64    8        95.21    2
DBSCAN           93.4     0.3      89.3     7.3      90.12    0.4      88.46    1        98       3.11
FCM-DBSCAN       94.5     0.25     91.6     5.2      97.48    0.5      93.5     2.3      98.9     1.5
Table 1 shows the comparison between the clustering algorithms K-Means, Optics, EM, and DBSCAN and the proposed approach FCM-DBSCAN. The clustering algorithms are tested with different data reduction algorithms: PCA, PCA (Kernel), ICA, SOM, and SVD.
For Tables 2 and 3, we divided the dataset into training and testing data and then evaluated the proposed approach on the test data.
Table 2: The positive and negative matrix for the proposed system.
               Predicted True    Predicted False
Actual Yes     9500              44
Actual No      66                390
Table 3: The performance measures of our proposed system.
Recall        99.53%
Precision     99.3%
Sensitivity   99.53%
Specificity   85.52%
Accuracy      98.9%
F-measure     99.39%
Figure 18. The elapsed-time comparison between the different clustering techniques under the different reduction algorithms, based on the MapReduce model. (X-axis: clustering techniques K-Means, Optics, EM, DBSCAN, FCM-DBSCAN; y-axis: time in seconds; one bar per reduction algorithm: PCA, kernel PCA, ICA, SOM, SVD.)
From the comparative studies in Table 1 and figure 18, we found that FCM-DBSCAN had the highest accuracy across the different data reduction approaches. FCM-DBSCAN with SVD has the highest accuracy and retrieves data in a short time. K-Means and Optics have similar accuracy values, but Optics takes longer. The EM algorithm takes more time than the other techniques. DBSCAN has high accuracy but also takes a long time. With FCM-DBSCAN, the accuracy increases and the elapsed time decreases.
6. CONCLUSION
A massive amount of IoT data has been generated due to the vast increase in devices, sensors, actuators, and network communications. The resulting massive IoT data is called "Big data", that is, data so large that it takes a long time to process. We therefore focused on a clustering methodology that relies on the MapReduce model to store data and recover results in near real-time, and we offer a framework for processing massive and heterogeneous IoT data.
This paper examined IoT Big data from many viewpoints. The raw dataset is collected from different sensors, which leads to many problems, such as noisy, heterogeneous, and massive data. Our proposed system aims to solve these problems facing sensor data. The architecture of the proposed system consists of two main phases: data preprocessing and data processing. In the preprocessing phase, we used KNN to clean noisy data and replace missing data with the most probable value. SVD is used to reduce the data and save time. Mutual information is applied to detect the relationships within the data and find semantic clusters, which improves accuracy and speeds up the running time. The MapReduce model based on FCM-DBSCAN clusters the data through the Map and Reduce functions in a short time, thanks to the reduction technique applied before clustering. The processing time of the proposed system is 1.5 seconds, and its accuracy is 98.9%.
+In future work, we will implement the processing on different datasets and apply +different techniques using spark model that aims to speed the running time. Moreover, +we will implement data query processing using the best and suitable model of NoSQL +database. We suggest Key-value database. The Key-value (KV) stores use the +associative array, which is called a map. This approach can efficiently retrieve +selective key ranges. Also, we will address the challenges and deeply develop the big +data processing in cloud computing environments. +7. REFERENCES +[1] Li, T., Liu, Y., Tian, Y., Shen,S., & Mao, W. (2012). A storage solution for +massive IoT data based on NoSQL. IEEE International Conference on Green +Computing and Communications (GreenCom), Besancon, 50-57. +[2] Tsai, C., Lai, C., Chiang, M., & Yang, L. (2014). Data Mining for Internet of +Things: A Survey. IEEE Communications Surveys & Tutorials, 16(1), 77-97. +[3] Sharma, S. & Mangat, V. (2015). Technology and trends to handle big data: a +survey. 5th IEEE International Conference on Advanced Computing & +Communication Technologies (ACCT), Haryana, 266-271. +[4] Martha, V. S., Zhao, W., & Xu, X. (2013). h-MapReduce: a framework for +workload balancing in MapReduce. 27th IEEE International Conference on +Advanced Information Networking and Applications, 637-644. +[5] Dharni, C. & Bnasal, M. (2013). An Improvement of DBSCAN Algorithm to +Analyze Cluster for Large Datasets. IEEE International Conference on MOOC, +Innovation and Technology in Education (MITE), 42-46. +[6] Ghosh, S. & Kumar, S. (2013). Comparative Analysis of K-Means and Fuzzy CMeans +Algorithms. International Journal Of Advanced Computer Science And +Applications, 4(4), 35-39. +[7] Bora, D. & Gupta, D. (2014). A Comparative study Between Fuzzy Clustering +Algorithm and Hard Clustering Algorithm. International Journal Of Computer +Trends And Technology, 10(2), 108-113. +[8] Han, J., Kamber, M., & Pei, J. (2012). Data Mining Concepts and Techniques. +Third Edition, Elsevier, Chapter 9, 422- 425. +ISSN:0254-0223 Vol. 31 (n. 7, 2016) +26 +[9] Singular Value Decomposition (SVD) tutorial. (2016). Web.mit.edu. Retrieved 7 +Jan 2016, from +http://web.mit.edu/be.400/www/SVD/Singular_Value_Decomposition.htm +[10] Tao, X. & Ji, C. (2014). Clustering massive small data for IOT. 2nd +International Conference on Systems and Informatics( ICSAI), Shanghai, 974- +978. +[11] Liancheng, X. & Jiao, X. (2014). Research on distributed data stream mining in +Internet of Things. International Conference On Logistics Engineering +Management And Computer Science (LEMCS), Atlantis Press, 149- 154. +[12] Wang, H., Lin, G., Wang, J., Gao, W., Chen, Y., & Duan, Q. (2014). +Management of Big Data in the Internet of Things in Agriculture Based on Cloud +Computing. AMM, 548-549, 1438-1444. +[13] Gole, S. & Tidke,B. (2015). Frequent itemset mining for big data in social media +using ClustBigFIM algorithm. IEEE International Conference on Pervasive +Computing (ICPC), Pune, 1-6. +[14] Mesiti, M.& Valtolina, S. (2014). Towards a user-friendly loading system for +the analysis of big data in The Internet Of Things. 38Th IEEE Annual +International Computers, Software, And Applications Conference Workshops +(COMPSACW), Vasteras, 312- 317. +[15] Zhang, G., Li,C., Zhang, Y., Xing, C., & Yang, J. (2012). An efficient massive +data processing model in the Cloud- A preliminary report. 7th ChinaGrid Annual +Conference, Beijing, 148-155. 
+[16] Galache, J., Yonezawa, T., Gurgen, L., Pavia, D., Grella, M., & Maeomichi, H. +(2014). ClouT: leveraging cloud computing techniques for improving +management of massive IoT data. 7th IEEE International Conference on Service- +Oriented Computing and Applications(SOCA), Matsue, 24-327. +[17] Sowe, S., Kimata, T., Dong, M., & Zettsu, K. (2014). Managing heterogeneous +sensor data on a big data platform: IoT services for data-intensive science. 38Th +IEEE Annual International Computers, Software, And Applications Conference +Workshops, Vasteras, 259-300. +[18] Cecchinel, C., Jimenez, M., Mosser, S., & Riveill, M. (2014). An architecture to +support the collection of big data in The Internet Of Things. 10Th IEEE World +Congress On Services, Anchorage, AK, 442-449. +[19] Mishra, N., Lin, C., & Chang, H. (2014). A cognitive-oriented framework for +IoT big-data management prospective. IEEE International Conference +Communication Problem-Solving (ICCP), Beijing, 124-127. +[20] What is Data Preprocessing? - Definition from Techopedia. (2015). Techopedia. +com. Retrieved 9 July 2015, from +http://www.techopedia.com/definition/14650/data-preprocessing +ISSN:0254-0223 Vol. 31 (n. 7, 2016) +27 +[21] Tang, N. (2015). Big RDF data cleaning. 31st IEEE International Conference +on Data Engineering Workshops (ICDEW), Seoul, 77-79 . +[22] Shoaip, N., Elmogy, M., Riad, A., & Badria, F. (2015). Missing Data Treatment +Using Interval-valued Fuzzy Rough Sets with SVM. International Journal of +Advancements in Computing Technology(IJACT), 7(5), 37-48. +[23] Sadeghzadeh, K. & Fard, N. (2015). Nonparametric data reduction approach for +large-scale survival data analysis. IEEE Reliability and Maintainability +Symposium (RAMS), Palm Harbor, 1 – 6. +[24] Katole, S. & Karmore, S. (2015). A new approach of microarray data dimension +reduction for medical applications. 2nd IEEE International Conference on +Electronics and Communication Systems (ICECS), Coimbatore, 409-413. +[25] Saranya, K., Hema, M., & Chandramathi, S. (2014). Data fusion in ontology +based data integration. IEEE International Conference on Information +Communication and Embedded Systems (ICICES), Chennai, Tamil Nadu, India, +1-6. +[26] Pal, K. (2015). How to Address Common Big Data Pain Points. Data Informed. +Retrieved 8 July 2015, from http://data-informed.com/how-to-address-commonbig- +data-pain-points +[27] Cover, T. & Thomas, J. (2012). Elements of information theory. Second Edition, +John Wiley & Sons, Chapter 2, 19-22 . +[28] Encyclopedia Britannica: data processing | computer science. (2015). +Encyclopedia Britannica. Retrieved 7 July 2015, from +http://www.britannica.com/technology/data-processing +[29] ADL Recognition Based on the Combination of RFID and Accelerometer +Sensing | Embedded Sensing Systems - www.ess.tu-darmstadt.de. (2015). Ess.tudarmstadt. +de. Retrieved 17 August 2015, from http://www.ess.tudarmstadt. 
+de/datasets/PHealth08-ADL +View publication stats \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..0a536999d8e50ea8d27a89590e46ce5681b6b4b0 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/A BIG DATA PROCESSING FRAMEWORK BASED ON MAPREDUCE WITH APPLICATION TO INTERNET OF THINGS.txt.xml.xls differ diff --git "a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt" "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt" new file mode 100644 index 0000000000000000000000000000000000000000..2cc94323d42dfbe3723c7bf8d460d5f12d03f6b2 --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt" @@ -0,0 +1,152 @@ +Apache Hadoop Architecture – HDFS, YARN & MapReduce +Explore the architecture of Hadoop, which is the most adopted framework for storing and processing massive data. + +In this article, we will study Hadoop Architecture. The article explains the Hadoop architecture and the components of Hadoop architecture that are HDFS, MapReduce, and YARN. In the article, we will explore the Hadoop architecture in detail, along with the Hadoop Architecture diagram. + +Let us now start with Hadoop Architecture. + +Hadoop Architecture +hadoop architecture + +The goal of designing Hadoop is to develop an inexpensive, reliable, and scalable framework that stores and analyzes the rising big data. + +Apache Hadoop is a software framework designed by Apache Software Foundation for storing and processing large datasets of varying sizes and formats. + +Hadoop follows the master-slave architecture for effectively storing and processing vast amounts of data. The master nodes assign tasks to the slave nodes. + +The slave nodes are responsible for storing the actual data and performing the actual computation/processing. The master nodes are responsible for storing the metadata and managing the resources across the cluster. + +Slave nodes store the actual business data, whereas the master stores the metadata. + +The Hadoop architecture comprises three layers. They are: + +Storage layer (HDFS) +Resource Management layer (YARN) +Processing layer (MapReduce) +hadoop component - hadoop architecture + +The HDFS, YARN, and MapReduce are the core components of the Hadoop Framework. + +Let us now study these three core components in detail. + +1. HDFS +HDFS Architecture + +HDFS is the Hadoop Distributed File System, which runs on inexpensive commodity hardware. It is the storage layer for Hadoop. The files in HDFS are broken into block-size chunks called data blocks. + +These blocks are then stored on the slave nodes in the cluster. The block size is 128 MB by default, which we can configure as per our requirements. + +Like Hadoop, HDFS also follows the master-slave architecture. It comprises two daemons- NameNode and DataNode. The NameNode is the master daemon that runs on the master node. The DataNodes are the slave daemon that runs on the slave nodes. 
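To make the storage layer concrete, the following is a small, hedged Java sketch that writes and then reads a file through the HDFS client API (org.apache.hadoop.fs.FileSystem). The NameNode URI and the file path are placeholders, and in a real cluster fs.defaultFS would normally come from core-site.xml:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://namenode-host:8020");   // placeholder URI
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/user/demo/hello.txt");            // placeholder path

        // Write: the client asks the NameNode where to place blocks, then streams to DataNodes.
        try (FSDataOutputStream out = fs.create(path, true)) {
            out.write("Hello HDFS".getBytes(StandardCharsets.UTF_8));
        }

        // Read: block locations come from the NameNode, the bytes from the DataNodes.
        try (FSDataInputStream in = fs.open(path);
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            System.out.println(reader.readLine());
        }
        fs.close();
    }
}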
+ +NameNode +NameNode stores the filesystem metadata, that is, files names, information about blocks of a file, blocks locations, permissions, etc. It manages the Datanodes. + +DataNode +DataNodes are the slave nodes that store the actual business data. It serves the client read/write requests based on the NameNode instructions. + +DataNodes stores the blocks of the files, and NameNode stores the metadata like block locations, permission, etc. + +2. MapReduce +apache hadoop mapreduce + +It is the data processing layer of Hadoop. It is a software framework for writing applications that process vast amounts of data (terabytes to petabytes in range) in parallel on the cluster of commodity hardware. + +The MapReduce framework works on the pairs. + +The MapReduce job is the unit of work the client wants to perform. MapReduce job mainly consists of the input data, the MapReduce program, and the configuration information. Hadoop runs the MapReduce jobs by dividing them into two types of tasks that are map tasks and reduce tasks. The Hadoop YARN scheduled these tasks and are run on the nodes in the cluster. + +Due to some unfavorable conditions, if the tasks fail, they will automatically get rescheduled on a different node. + +The user defines the map function and the reduce function for performing the MapReduce job. + +The input to the map function and output from the reduce function is the key, value pair. + +The function of the map tasks is to load, parse, filter, and transform the data. The output of the map task is the input to the reduce task. Reduce task then performs grouping and aggregation on the output of the map task. + +The MapReduce task is done in two phases- + +1. Map phase +a. RecordReader + +Hadoop divides the inputs to the MapReduce job into the fixed-size splits called input splits or splits. The RecordReader transforms these splits into records and parses the data into records but it does not parse the records itself. RecordReader provides the data to the mapper function in key-value pairs. + +b. Map + +In the map phase, Hadoop creates one map task which runs a user-defined function called map function for each record in the input split. It generates zero or multiple intermediate key-value pairs as map task output. + +The map task writes its output to the local disk. This intermediate output is then processed by the reduce tasks which run a user-defined reduce function to produce the final output. Once the job gets completed, the map output is flushed out. + +c. Combiner + +Input to the single reduce task is the output from all the Mappers that is output from all map tasks. Hadoop allows the user to define a combiner function that runs on the map output. + +Combiner groups the data in the map phase before passing it to Reducer. It combines the output of the map function which is then passed as an input to the reduce function. + +d. Partitioner + +When there are multiple reducers then the map tasks partition their output, each creating one partition for each reduce task. In each partition, there can be many keys and their associated values but the records for any given key are all in a single partition. + +Hadoop allows users to control the partitioning by specifying a user-defined partitioning function. Generally, there is a default Partitioner that buckets the keys using the hash function. + +2. Reduce phase: +The various phases in reduce task are as follows: + +a. Sort and Shuffle: + +The Reducer task starts with a shuffle and sort step. 
The main purpose of this phase is to collect the equivalent keys together. Sort and Shuffle phase downloads the data which is written by the partitioner to the node where Reducer is running. + +It sorts each data piece into a large data list. The MapReduce framework performs this sort and shuffles so that we can iterate over it easily in the reduce task. + +The sort and shuffling are performed by the framework automatically. The developer through the comparator object can have control over how the keys get sorted and grouped. + +b. Reduce: + +The Reducer which is the user-defined reduce function performs once per key grouping. The reducer filters, aggregates, and combines data in several different ways. Once the reduce task is completed, it gives zero or more key-value pairs to the OutputFormat. The reduce task output is stored in Hadoop HDFS. + +c. OutputFormat + +It takes the reducer output and writes it to the HDFS file by RecordWriter. By default, it separates key, value by a tab and each record by a newline character. + +hadoop mapreduce - hadoop architecture + +3. YARN +YARN stands for Yet Another Resource Negotiator. It is the resource management layer of Hadoop. It was introduced in Hadoop 2. + +YARN is designed with the idea of splitting up the functionalities of job scheduling and resource management into separate daemons. The basic idea is to have a global ResourceManager and application Master per application where the application can be a single job or DAG of jobs. + +YARN consists of ResourceManager, NodeManager, and per-application ApplicationMaster. + +apache hadoop yarn +1. ResourceManager +It arbitrates resources amongst all the applications in the cluster. + +It has two main components that are Scheduler and the ApplicationManager. + +a. Scheduler + +The Scheduler allocates resources to the various applications running in the cluster, considering the capacities, queues, etc. +It is a pure Scheduler. It does not monitor or track the status of the application. +Scheduler does not guarantee the restart of the failed tasks that are failed either due to application failure or hardware failure. +It performs scheduling based on the resource requirements of the applications. +b. ApplicationManager + +They are responsible for accepting the job submissions. +ApplicationManager negotiates the first container for executing application-specific ApplicationMaster. +They provide service for restarting the ApplicationMaster container on failure. +The per-application ApplicationMaster is responsible for negotiating containers from the Scheduler. It tracks and monitors their status and progress. +2. NodeManager: +NodeManager runs on the slave nodes. It is responsible for containers, monitoring the machine resource usage that is CPU, memory, disk, network usage, and reporting the same to the ResourceManager or Scheduler. + +3. ApplicationMaster: +The per-application ApplicationMaster is a framework-specific library. It is responsible for negotiating resources from the ResourceManager. It works with the NodeManager(s) for executing and monitoring the tasks. + +Summary +In this article, we have studied Hadoop Architecture. The Hadoop follows master-slave topology. The master nodes assign tasks to the slave nodes. The architecture comprises three layers that are HDFS, YARN, and MapReduce. + +HDFS is the distributed file system in Hadoop for storing big data. MapReduce is the processing framework for processing vast data in the Hadoop cluster in a distributed manner. 
YARN is responsible for managing the resources amongst applications in the cluster. + +The HDFS daemon NameNode and YARN daemon ResourceManager run on the master node in the Hadoop cluster. The HDFS daemon DataNode and the YARN NodeManager run on the slave nodes. + +HDFS and MapReduce framework run on the same set of nodes, which result in very high aggregate bandwidth across the cluster. + +Keep Learning!! \ No newline at end of file diff --git "a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt.xml.xls" "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt.xml.xls" new file mode 100644 index 0000000000000000000000000000000000000000..5a7cac9932dabcd8a0bb4b7d70da7b279c2db650 Binary files /dev/null and "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Apache Hadoop Architecture \342\200\223 HDFS, YARN & MapReduce.txt.xml.xls" differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt new file mode 100644 index 0000000000000000000000000000000000000000..a36ff99b097e67a735ba4cb66ee5fdc0f9a981d1 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt @@ -0,0 +1,146 @@ +See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/311558422 +Big Data Analysis: Challenges and Solutions +Conference Paper · December 2013 +CITATIONS +40 +READS +4,637 +2 authors: +Some of the authors of this publication are also working on these related projects: +Soft Computing Approach for Diabetes Diagnosis View project +Prediction of thunderstorm and lightning using soft computing and data mining View project +Puneet Singh Duggal +Birla Institute of Technology, Mesra +2 PUBLICATIONS 43 CITATIONS +SEE PROFILE +Sanchita Paul +Birla Institute of Technology, Mesra +49 PUBLICATIONS 402 CITATIONS +SEE PROFILE +All content following this page was uploaded by Puneet Singh Duggal on 10 December 2016. +The user has requested enhancement of the downloaded file. +International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV +269 +Big Data Analysis: Challenges and Solutions Puneet Singh Duggal Department of Computer Science & Engineering Birla Institute of Technology, Mesra, Ranchi, India duggal@gmail.com +Sanchita Paul Department of Computer Science & Engineering Birla Institute of Technology Mesra, Ranchi, India +sanchita07@gmail.com Abstract—We live in on-demand, on-command Digital universe with data prolifering by Institutions, Individuals and Machines at a very high rate. This data is categories as "Big Data" due to its sheer Volume, Variety, Velocity and Veracity. Most of this data is unstructured, quasi structured or semi structured and it is heterogeneous in nature. The volume and the heterogeneity of data with the speed it is generated, makes it difficult for the present computing infrastructure to manage Big Data. Traditional data management, warehousing and analysis systems fall short of tools to analyze this data. Due to its specific nature of Big Data, it is stored in distributed file system architectures. Hadoop and HDFS by Apache is widely used for storing and managing Big Data. 
Analyzing Big Data is a challenging task as it involves large distributed file systems which should be fault tolerant, flexible and scalable. Map Reduce is widely been used for the efficient analysis of Big Data. Traditional DBMS techniques like Joins and Indexing and other techniques like graph search is used for classification and clustering of Big Data. These techniques are being adopted to be used in Map Reduce. In this research paper the authors suggest various methods for catering to the problems in hand through Map Reduce framework over Hadoop Distributed File System (HDFS). Map Reduce is a Minimization technique which makes use of file indexing with mapping, sorting, shuffling and finally reducing. Map Reduce techniques have been studied at in this paper which is implemented for Big Data analysis using HDFS. Keyword-Big Data Analysis, Big Data Management, Map Reduce, HDFS +I. INTRODUCTION +Big Data encompasses everything from click stream data from the web to genomic and proteomic data from biological research and medicines. Big Data is a heterogeneous mix of data both structured (traditional datasets –in rows and columns like DBMS tables, CSV's and XLS's) and unstructured data like e-mail attachments, manuals, images, PDF documents, medical records such as x-rays, ECG and MRI images, forms, rich media like graphics, video and audio, contacts, forms and documents. Businesses are primarily concerned with managing unstructured data, because over 80 percent of enterprise data is unstructured [26] and require significant storage space and effort to manage.“Big data” refers to datasets whose size is beyond the ability of typical database software tools to capture, store, manage, and analyse [3]. +Big data analyticsis the area where advanced analytic techniques operate on big data sets. It is really about two things, Big data and Analytics and how the two have teamed up to create one of the most profound trends in business intelligence (BI) [4]. Map Reduce by itself is capable for analysing large distributed data sets; but due to the heterogeneity, velocity and volume of Big Data, it is a challenge for traditional data analysis and management tools [1] [2]. A problem with Big Data is that they use NoSQL and has no Data Description Language (DDL) and it supports transaction processing. Also, web-scale data is not universal and it is heterogeneous. For analysis of Big Data, database integration and cleaning is much harder than the traditional mining approaches [4]. Parallel processing and distributed computing is becoming a standard procedure which are nearly non-existent in RDBMS. Map Reduce has following characteristics [12]; it supports Parallel and distributed processing, it is simple and its architecture is shared-nothing which has commodity diverse hardware (big cluster).Its functions are programmed in a high-level programming language (e.g. Java, Python) and it is flexible. Query processing is done through NoSQL integrated in HDFS as Hive tool [20]. Analytics helps to discover what has changed and the possible solutions. Second, advanced analytics is the best way to discover more business opportunities, new customer segments, identify the best suppliers, associate products of affinity, understand sales seasonality[25] etc. Traditional experience in data warehousing, reporting, and online analytic processing (OLAP) is different for advanced forms of analytics [6]. Organizations are implementing specific forms of analytics, particularly called advanced analytics. 
These are an collection of related techniques and tool types, usually including predictive analytics, data mining, statistical analysis, complex SQL, data visualization, artificial intelligence, natural language processing. Database analytics platforms such as MapReduce, in-database analytics, in-memory databases, and columnar data stores [6] [9] are used for standardizing them. +With big data analytics, the user is trying to discover new business facts that no one in the enterprise knew before, a better term would be “discovery analytics. To do that, the analyst needs large volumes of data with plenty of detail. This is often data that the enterprise has not yet tapped for analytics example, the log data. The analyst might mix that data with historic data from a data warehouse and would discover for example, new change behaviour in a subset of the customer base. The discovery would lead to a metric, report, analytic model, or some other product of BI, through which the company could track and predict the new form of customer behavioural change. +International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV +270 +Discovery analytics against big data can be enabled by different types of analytic tools, including those based on SQL queries, data mining, statistical analysis, fact clustering, data visualization, natural language processing, text analytics, artificial intelligence etc [4-6]. A unique challenge for researchers system and academicians is that the large datasets needs special processing systems [5]. Map Reduce over HDFS gives Data Scientists [1-2] the techniques through which analysis of Big Data can be done. HDFS is a distributed file system architecture which encompasses the original Google File System [13].Map Reduce jobs use efficient data processingtechniques which can be applied in each of the phases of MapReduce; namely Mapping, Combining, Shuffling,Indexing, Grouping and Reducing [7]. All these techniques have been studied in this paper for implementation in Map Reduce tasks. +II. BIG DATA : OPPORTUNITIES AND CHALLENGES +In the distributed systems world, “Big Data” started to become a major issue in the late 1990‟s due to the impact of the world-wide Web and a resulting need to index and query its rapidly mushrooming content. Database technology (including parallel databases) was considered for the task, but was found to be neither well-suited nor cost-effective [5] for those purposes. The turn of the millennium then brought further challenges as companies began to use information such as the topology of the Web and users‟ search histories in order to provide increasingly useful search results, as well as more effectively-targeted advertising to display alongside and fund those results. Google‟s technical response to the challenges of Web-scale data management and analysis was simple, by database standards, but kicked off what has become the modern “Big Data” revolution in the systems world [3]. To handle the challenge of Web-scale storage, the Google File System (GFS) was created [13]. GFS provides clients with the familiar OS-level byte-stream abstraction, but it does so for extremely large files whose content can span hundreds of machines in shared-nothing clusters created using inexpensive commodity hardware [5]. To handle the challenge of processing the data in such large files, Google pioneered its Map Reduce programming model and platform [1][13]. 
This model, characterized by some as “parallel programming for dummies”, enabled Google‟s developers to process large collections of data by writing two user-defined functions, map and reduce, that the Map Reduce framework applies to the instances (map) and sorted groups of instances that share a common key (reduce) – similar to the sort of partitioned parallelism utilized in shared-nothing parallel query processing. +Driven by very similar requirements, software developers at Yahoo!, Facebook, and other large Web companies followed suit. Taking Google‟s GFS and Map Reduce papers as rough technical specifications, open-source equivalents were developed, and the Apache Hadoop Map Reduce platform and its underlying file system (HDFS, the Hadoop Distributed File System) were born [1] [12]. The Hadoop system has quickly gained traction, and it is now widely used for use cases including Web indexing, clickstream and log analysis, and certain large-scale information extraction and machine learning tasks. Soon tired of the low-level nature of the Map Reduce programming model, the Hadoop community developed a set of higher-level declarative languages for writing queries and data analysis pipelines that are compiled into Map Reduce jobs and then executed on the Hadoop Map Reduce platform. Popular languages include Pig from Yahoo! [18], Jaql from IBM [28], and Hive from Facebook [18]. Pig is relational-algebra-like in nature, and is reportedly used for over 60% of Yahoo!‟s MapReduce use cases; Hive is SQL-inspired and reported to be used for over 90% of the Facebook Map Reduce use cases. Microsoft‟s technologies include a parallel runtime system called Dryad and two higher-level programming models, Dryad LINQ and the SQLlike SCOPE language [27], which utilizes Dryad under the covers. Interestingly, Microsoft has also recently announced that its future “Big Data” strategy includes support for Hadoop[24]. +III. HADOOP AND HDFS +Hadoop is a scalable, open source, fault-tolerant Virtual Grid operating system architecture for data storage and processing. It runs on commodity hardware, it uses HDFS which is fault-tolerant high-bandwidth clustered storage architecture. It runs MapReduce for distributed data processing and is works with structured and unstructured data. +Figure1Illustrates the layers found in the software architecture of aHadoop stack [17] [19]. At the bottom of the Hadoop software stack is HDFS, a distributed file system in which each file appears as a (very large) contiguous and randomly addressable sequence of bytes. For batch analytics, the middle layer of the stack is the Hadoop Map Reduce system, which applies map operations to the data in partitions of an HDFS file, sorts and redistributes the results based on key values in the output data, and then performs reduce operations on the groups of output data items with matching keys from the map phase of the job. For applications just needing basic key-based record management operations, the HBase store (layered on top of HDFS) is available as a key-value layer in the Hadoop stack. As indicated in the figure, the contents of HBase can either be directly accessed and manipulated by a client application or accessed via Hadoop for analytical needs. Many users of the Hadoop stack prefer the use of a declarative language over the bare MapReduce programming model. High-level language compilers (Pig and Hive) are thus the topmost layer in the Hadoop software stack for such clients. 
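Since the stack described above is ultimately programmed against the two user-defined functions map and reduce, a minimal Java sketch of that model may help: the classic word-count job written against the org.apache.hadoop.mapreduce API. The class names are illustrative and this is not code from the paper:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCount {

    // map(k1, v1) -> list(k2, v2): emit (word, 1) for every token in the input line.
    public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokens = new StringTokenizer(value.toString());
            while (tokens.hasMoreTokens()) {
                word.set(tokens.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // reduce(k2, list(v2)) -> list(v2): sum the counts for each word.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
}

The same SumReducer class can usually also be registered as a combiner, which is the "mini-reducer" optimization discussed later in this paper.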
+International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV +271 +Figure 1.Hadoop Architecture Layers Figure 2.Hadoop Architecture Tools and usage +Figure 3. HDFS Clusters +Figure2 shows the relevancy between the traditional experience in data warehousing, reporting, and online analytic processing (OLAP) and advanced analytics with collection of related techniques like data mining with DBMS, artificial intelligence, machine learning, and database analytics platforms such as MapReduce and Hadoop over HDFS [4] [9]. Figure 3 shows the architecture of HDFS clusters implementation with Hadoop. It can be seen that HDFS has distributed the task over two parallel clusters with one server and two slave nodes each. Data analysis tasks are distributed in these clusters. +IV. BIG DATA ANALYSIS +Heterogeneity, scale, timeliness, complexity, and privacy problems with Big Data hamper the progress at all phases of the process that can create value from data. Much data today is not natively in structured format; for example, tweets and blogs are weakly structured pieces of text, while images and video are structured for storage and display, but not for semantic content and search: transforming such content into a structured format for later analysis is a major challenge [15]. The value of data enhances when it can be linked with other data, thus data integration is a major creator of value. Since most data is directly generated in digital format today, we have the opportunity and the challenge both to influence the creation to facilitate later linkage and to automatically link previously created data. Data analysis, organization, retrieval, and modelling are other foundational challenges [6]. Big Data analysis is a clear bottleneck in many applications, both due to lack of scalability of the underlying algorithms and due to the complexity of the data that needs to be analysed. Finally, presentation of the results and its interpretation by non-technical domain experts is crucial to extracting actionable knowledge as most of the BI related jobs are handled by statisticians and not software experts. +Figure 4, below gives a glimpse of the Big Data analysis tools which are used for efficient and precise data analysis and management jobs. The Big Data Analysis and management setup can be understood through the layered structured defined in the figure. The data storage part is dominated by the HDFS distributed file system architecture; other mentioned architectures available are Amazon Web Service (AWS) [23], Hbase and CloudStore etc. The data processing tasks for all the tools is Map Reduce; we can comfortably say that it is the de-facto Data processing tool used in the Big Data paradigm. +International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV +272 +Figure 4. Big Data Analysis Tools +For handling the velocity and heterogeneity of data, tools like Hive, Pig and Mahout are used which are parts of Hadoop and HDFS framework. It is interesting to note that for all the tools used, Hadoop over HDFS is the underlying architecture. Oozie and EMR with Flume and Zookeeper are used for handling the volume and veracity of data, which are standard Big Data management tools. The layer with their specified tools forms the bedrock for Big Data management and analysis framework. +V. MAP REDUCE +MapReduce [1-2] is a programming model for processing large-scale datasets in computer clusters. The MapReduce programming model consists of two functions, map() and reduce(). 
Users can implement their own processing logic by specifying a customized map() and reduce() function. The map() function takes an input key/value pair and produces a list of intermediate key/value pairs. The MapReduce runtime system groups together all intermediate pairs based on the intermediate keys and passes them to reduce() function for producing the final results. Map (in_key, in_value) --->list(out_key,intermediate_value) Reduce (out_key,list(intermediate_value)) -- ->list(out_value) The signatures of map() and reduce() are as follows : map (k1,v1) ! list(k2,v2)and reduce (k2,list(v2)) ! list(v2) +A MapReduce cluster employs a master-slave architecture where one master node manages a number of slave nodes [19]. In the Hadoop, the master node is called JobTracker and the slave node is called TaskTracker as shown in the figure 7. Hadoop launches a MapReduce job by first splitting the input dataset into even-sized data blocks. Each data block is then scheduled to one TaskTracker node and is processed by a map task. The TaskTracker node notifies the JobTracker when it is idle. The scheduler then assigns new tasks to it. The scheduler takes data locality into account when it disseminates data blocks. +Figure 5. Map Reduce Architecture and Working It always tries to assign a local data block to a TaskTracker. If the attempt fails, the scheduler will assign a rack-local or random data block to the TaskTracker instead. When map() functions complete, the runtime system groups all intermediate pairs and launches a set of reduce tasks to produce the final results. Large scale data processing is a difficult task, managing hundreds or thousands of processors and managing parallelization and distributed environments makes is more difficult. Map Reduce provides solution to the mentioned issues, as is supports distributed and parallel I/O scheduling, it is fault tolerant and supports scalability and i has inbuilt processes for status and monitoring of heterogeneous and large datasets as in Big Data [14]. +International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV +273 +A. Map Reduce Components +1. Name Node – manages HDFS metadata, doesn‟t deal with files directly +2. Data Node – stores blocks of HDFS – default replication level for each block: 3 +3. Job Tracker – schedules, allocates and monitors job execution on slaves – Task Trackers +4. Task Tracker – runs Map Reduce operations +Figure 6.Map Reduce Components +B. Map Reduce Working +We implement the Mapper and Reducer interfaces to provide the map and reduce methods as shown in figure 6. These form the core of the job. +1) Mapper +Mapper maps input key/value pairs to a set of intermediate key/value pairs. Maps are the individual tasks that transform input records into intermediate records. The transformed intermediate records do not need to be of the same type as the input records. A given input pair may map to zero or many output pairs [19]. The number of maps is usually driven by the total size of the inputs, that is, the total number of blocks of the input files. The right level of parallelism for maps seems to be around 10-100 maps per-node, although it has been set up to 300 maps for very cpu-light map tasks. Task setup takes awhile, so it is best if the maps take at least a minute to execute. For Example, if you expect 10TB of input data and have a blocksize of 128MB, you'll end up with 82,000 maps [17] [19]. +2) Reducer +Reducer reduces a set of intermediate values which share a key to a smaller set of values. 
Reducer has 3 primary phases: shuffle, sort and reduce. +2.1) Shuffle +Input to the Reducer is the sorted output of the mappers. In this phase the framework fetches the relevant partition of the output of all the mappers, via HTTP. +2.2) Sort +The framework groups Reducer inputs by keys (since different mappers may have output the same key) in this stage. The shuffle and sort phases occur simultaneously; while map-outputs are being fetched they are merged. +2.3) Secondary Sort +If equivalence rules for grouping the intermediate keys are required to be different from those for grouping keys before reduction, then one may specify a Comparator (Secondary Sort ). +2.4) Reduce +In this phase the reduce method is called for each pair in the grouped inputs.The output of the reduce task is typically written to the File System via Output Collector[19]. +Applications can use the Reporter to report progress, set application-level status messages and update Counters, or just indicate that they are alive. The output of the Reducer is not sorted. The right number of reduces seems to be 0.95 or 1.75 multiplied by no. of nodes. With 0.95 all of the reduces can launch immediately and start transferring map outputs as the maps finish. With 1.75 the faster nodes will finish their first round of reduces and launch a second wave of reduces doing a much better job of load balancing [MR Framework].Increasing the number of reduces increases the framework overhead, but increases load balancing and lowers the cost of failures. The scaling factors above are slightly less than whole numbers to reserve a few reduce slots in the framework for speculative-tasks and failed tasks. It is legal to set the number of reduce-tasks to zero if no reduction is desired. +a) Partitioner +Partitioner partitions the key space. Partitioner controls the partitioning of the keys of the intermediate map-outputs. The key (or a subset of the key) is used to derive the partition, typically by a hash function. The total number of partitions is the same as the number of reduce tasks for the job. Hence this controls which of the m reduce tasks the intermediate key (and hence the record) is sent to for reduction. +Hash Partitioner is the default Partitioner. +b) Reporter +Reporter is a facility for MapReduce applications to report progress, set application-level status messages and update Counters.Mapper and Reducer implementations can use the Reporter to report progress or just indicate that they are alive. In scenarios where the application takes a significant amount of time to process individual key/value pairs, this is crucial since the framework might assume that the task has timed-out and kill that task. Applications can also update Counters using the Reporter. +c) Output Collector +Output Collector is a generalization of the facility provided by the MapReduce framework to collect data output by the Mapper or the Reducer (either the intermediate outputs or the +Name Node–manages HDFS metadata, doesn’t deal with files directly +Data Node –stores blocks of HDFS –default replication level for each block: 3 +Job Tracker –schedules, allocates and monitors job execution on slaves –Task Trackers +Task Tracker –runs Map Reduce operations +International Conference on Cloud, Big Data and Trust 2013, Nov 13-15, RGPV +274 +output of the job). HadoopMapReduce comes bundled with a library of generally useful mappers, reducers, and partitioners +Figure 7. Map Reduce Working through Master / Slave +C. 
Map Reduce techniques + Combining +Combiners provide a general mechanism within the MapReduce framework to reduce the amount of intermediate data generated by the mappers. They can be understood as "mini-reducers" that process the output of mappers. The combiner's aggregate term counts across the documents processed by each map task. This result in a reduction in the number of intermediate key-value pairs that need to be shuffled across the network, from the order of total number of terms in the collection to the order of the number of unique terms in the collection. They reduce the result size of map functions and perform reduce-like function in each machine which decreases the shuffling cost. + Inverse Indexing +Inverse indexing is a technique in which the keywords of the documents are mapped according to the document keys in which they are residing. For example Doc1: IMF, Financial Economics Crisis Doc2: IMF, Financial Crisis Doc3: Harry Economics Doc4: Financial Harry Potter Film Doc5: Harry Potter Crisis The following is the inverted index of the above data IMF -> Doc1:1, Doc2:1 Financial -> Doc1:6, Doc2:6, Doc4:1 Economics -> Doc1:16, Doc3:7 Crisis -> Doc1:26, Doc2:16, Doc5:14 Harry -> Doc3:1, Doc4:11, Doc5:1 Potter -> Doc4:17, Doc5:7 Film -> Doc4:24 + Shuffling +Shuffling is the procedure of mixing the indexes of the files and their keys, so that a heterogeneous mix of dataset can be obtained. If the dataset is shuffled, then there are better chances that the resultant query processing will yield near accurate results. We can relate the shuffling process with the population generating by crossover in the GA algorithms. The processes are different in nature, but their purpose is similar.[7] + Sharding +It is a term used to distribute the Mappers in the HDFS architecture. Sharding refers to the groupings or documents which are done so that the MapReduce jobs are done parallel in a distributed environment. + Joins +Join is a RDBMS term; it refers to combining two or more discrete datasets to get Cartesian product of data of all the possible combinations. Map Reduce does not have its own Join techniques, but RDBMS techniques are tweaked and used to get the maximum possible combinations.The join techniques which are adopted for Map Reduce are Equi Join, Self Join, Repartition Join and Theta Join [7][10-11]. + Clustering & Classification +They are Data Analysis term, used mainly in Data Mining. In Map Reduce it is achieved through K means clustering [7]. Here, iterative working improves partitioning of data into k clusters. After the clustering, the data sorted are grouped together based upon rules to be formed into classes. The steps for clustering in Map Reduce are; Step1: Do Step2: Map Step3: Input is a data point and k centres are broadcasted Step4: Finds the closest centre among k centres for the input point Step5: Reduce Step6: Input is one of k centres and all data points having this centre as their closest centre Step7: Calculates the new centre using data points Step 8: Repeat 1-7, until all of new centres are not changed +VI. CONCLUSION +The need to process enormous quantities of data has never been greater. Not only are terabyte- and petabyte-scale datasets rapidly becoming commonplace, but there is consensus that great value lies buried in them, waiting to be unlocked by the right computational tools. In the commercial sphere, business intelligence, driven by the ability to gather data from a dizzying array of sources. 
+VI. CONCLUSION +The need to process enormous quantities of data has never been greater. Not only are terabyte- and petabyte-scale datasets rapidly becoming commonplace, but there is consensus that great value lies buried in them, waiting to be unlocked by the right computational tools. In the commercial sphere, business intelligence is driven by the ability to gather data from a dizzying array of sources, and Big Data analysis tools like Map Reduce over Hadoop and HDFS promise to help organizations better understand their customers and the marketplace, hopefully leading to better business decisions and competitive advantages [6]. For engineers building information processing tools and applications, large and heterogeneous datasets which generate a continuous flow of data lead to more effective algorithms for a wide range of tasks, from machine translation to spam detection. In the natural and physical sciences, the ability to analyse massive amounts of data may provide the key to unlocking the secrets of the cosmos or the mysteries of life. MapReduce can be exploited to solve a variety of problems related to text processing at scales that would have been unthinkable a few years ago [15]. No tool, no matter how powerful or flexible, can be perfectly adapted to every task. There are many examples of algorithms that depend crucially on the existence of shared global state during processing, making them difficult to implement in MapReduce (since the single opportunity for global synchronization in MapReduce is the barrier between the map and reduce phases of processing). Implementing online learning algorithms in MapReduce is problematic [14]. The model parameters in a learning algorithm can be viewed as shared global state, which must be updated as the model is evaluated against training data. All processes performing the evaluation (presumably the mappers) must have access to this state. In a batch learner, where updates occur in one or more reducers (or, alternatively, in the driver code), synchronization of this resource is enforced by the MapReduce framework. However, with online learning, these updates must occur after processing smaller numbers of instances. This means that the framework must be altered to support faster processing of smaller datasets, which goes against the design choices of most existing MapReduce implementations. Since MapReduce was specifically optimized for batch operations over large amounts of data, such a style of computation would likely result in inefficient use of resources [2]. In Hadoop, for example, map and reduce tasks have considerable start-up costs. +VII. ADVANCEMENTS +Streaming algorithms [9] represent an alternative programming model for dealing with large volumes of data with limited computational and storage resources. This model assumes that data are presented to the algorithm as one or more streams of inputs that are processed in order, and only once. Stream processing is very attractive for working with time-series data (news feeds, tweets, sensor readings, etc.), which is difficult in MapReduce (once again, given its batch-oriented design). Another system worth mentioning is Pregel [16], which implements a programming model inspired by Valiant's Bulk Synchronous Parallel (BSP) model. Pregel was specially designed for large-scale graph algorithms, but unfortunately there are few published details at present. +Pig [28], which is inspired by Google [13], can be described as a data analytics platform that provides a lightweight scripting language for manipulating large datasets. Although Pig scripts (in a language called Pig Latin) are ultimately converted into Hadoop jobs by Pig's execution engine, they allow developers to specify data transformations (filtering, joining, grouping, etc.) at a much higher level.
Similarly, Hive [20], another open-source project, provides an abstraction on top of Hadoop that allows users to issue SQL queries against large relational datasets stored in HDFS. Hive queries, written in HiveQL, are compiled down to Hadoop jobs by the Hive query engine. Therefore, the system provides a data analysis tool for users who are already comfortable with relational databases, while simultaneously taking advantage of Hadoop's data processing capabilities [11]. The power of MapReduce derives from providing an abstraction that allows developers to harness the power of large clusters, but abstractions manage complexity by hiding details and presenting well-defined behaviours to users of those abstractions. This process makes certain tasks easier, but others more difficult, if not impossible. MapReduce is certainly no exception to this generalization; even within the Hadoop/HDFS/MapReduce ecosystem, the development of alternative approaches for expressing distributed computations has already been observed. For example, there can be a third merge phase after map and reduce to better support relational operations. The join processing mentioned in the paper can also tackle MapReduce tasks effectively. The future directions in Big Data analysis give a very encouraging picture, as the tools are built on the existing paradigm of HDFS and Hadoop, overcoming the existing drawbacks of the present systems while keeping the advantages they provide over the traditional data analysis tools. +REFERENCES +[1] Jeffrey Dean and Sanjay Ghemawat, MapReduce: A Flexible Data Processing Tool, Communications of the ACM, Volume 53, Issue 1, January 2010, pp. 72-77. +[2] Jeffrey Dean and Sanjay Ghemawat, MapReduce: Simplified Data Processing on Large Clusters, Communications of the ACM, Volume 51, pp. 107–113, 2008. +[3] Brad Brown, Michael Chui, and James Manyika, Are you ready for the era of 'big data'?, McKinsey Quarterly, McKinsey Global Institute, October 2011. +[4] Dunren Che, Mejdl Safran, and Zhiyong Peng, From Big Data to Big Data Mining: Challenges, Issues, and Opportunities, DASFAA Workshops 2013, LNCS 7827, pp. 1–15, 2013. +[5] Marcin Jedyk, Making Big Data, Small: Using distributed systems for processing, analysing and managing large huge data sets, Software Professional's Network, Cheshire Data Systems Ltd. +[6] Onur Savas, Yalin Sagduyu, Julia Deng, and Jason Li, Tactical Big Data Analytics: Challenges, Use Cases and Solutions, Big Data Analytics Workshop in conjunction with ACM SIGMETRICS 2013, June 21, 2013. +[7] Kyuseok Shim, MapReduce Algorithms for Big Data Analysis, DNIS 2013, LNCS 7813, pp. 44–48, 2013. +[8] Raja Appuswamy, Christos Gkantsidis, Dushyanth Narayanan, Orion Hodson, Antony Rowstron, Nobody ever got fired for buying a cluster, Microsoft Research, Cambridge, UK, Technical Report MSR-TR-2013-2. +[9] Carlos Ordonez, Algorithms and Optimizations for Big Data Analytics: Cubes, Tech Talks, University of Houston, USA. +[10] Spyros Blanas, Jignesh M. Patel, Vuk Ercegovac, Jun Rao, Eugene J. Shekita, Yuanyuan Tian, A Comparison of Join Algorithms for Log Processing in MapReduce, SIGMOD '10, June 6–11, 2010, Indianapolis, Indiana, USA. +[11] Tyson Condie, Neil Conway, Peter Alvaro, Joseph M. Hellerstein, John Gerth, Justin Talbot, Khaled Elmeleegy, Russell Sears, Online Aggregation and Continuous Query Support in MapReduce, SIGMOD '10, June 6–11, 2010, Indianapolis, Indiana, USA. +[12] J. Dean and S.
Ghemawat, “MapReduce: Simplified data processing on large clusters,” in USENIXSymposium on Operating Systems Design and Implementation, San Francisco, CA, Dec. 2004, pp. 137–150. +[13] S. Ghemawat, H. Gobioff, and S. Leung, “The Google File System.” in ACM Symposium on Operating Systems Principles, Lake George, NY, Oct 2003, pp. 29 – 43. +[14] HADOOP-3759: Provide ability to run memory intensive jobs without affecting other running tasks on the nodes. https://issues.apache.org/jira/browse/HADOOP-3759 +[15] VinayakBorkar, Michael J. Carey, Chen Li, Inside “Big Data Management”:Ogres, Onions, or Parfaits?, EDBT/ICDT 2012 Joint Conference Berlin, Germany,2012 ACM 2012, pp 3-14. +[16] GrzegorzMalewicz, Matthew H. Austern, Aart J. C. Bik, James C.Dehnert, Ilan Horn, NatyLeiser, and GrzegorzCzajkowski,Pregel: A System for Large-Scale Graph Processing, SIGMOD‟10, June 6–11, 2010, pp 135-145. +[17] Hadoop,“PoweredbyHadoop,”http://wiki.apache.org/hadoop/PoweredBy. +[18] PIGTutorial,YahooInc., http://developer.yahoo.com/hadoop/tutorial/pigtutorial.html +[19] Apache: Apache Hadoop, http://hadoop.apache.org +[20] Apache Hive, http://hive.apache.org/ +[21] Apache Giraph Project, http://giraph.apache.org/ +[22] Mahout, http://lucene.apache.org/mahout/ +[23] Amazon Simple Storage Service (Amazon S3). http://aws.amazon.com/s3/ +[24] Windows.Azure.Storage.http://www.microsoft.com/windowsazure/features/storage/ +[25] The Age of Big Data. Steve Lohr. New York Times, Feb 11, 2012. http://www.nytimes.com/2012/02/12/sunday-review/big-datas-impact-in-the-world.html +[26] Information System & Management, ISM Book, 1st Edition 2010, EMC2, Wiley Publishing +[27] Dryad - Microsoft Research, http://research.microsoft.com/en-us/projects/dryad/ +[28] IBM-What.is.Jaql, www.ibm.com/software/data/infosphere/hadoop/jaql/ View publication stats \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..039b248ff187b904751eb0062c2edc367965f076 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Analysis Challenges and Solutions.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt new file mode 100644 index 0000000000000000000000000000000000000000..eccf8ce03f201707c2203ff3c50d6a62208ec746 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt @@ -0,0 +1,43 @@ +MapReduce +Related terms: +Extreme Learning Machine, Cloud Computing, Hadoop, Dataset, Programming +Model +View all Topics +Big Data Management on Wireless Sensor +Networks +In Big Data Analytics for Sensor-Network Collected Intelligence, 2017 +> Read full chapter +Big Data Analytics Challenges and Solutions +In Big Data Analytics for Intelligent Healthcare Management, 2019 +> Read full chapter +Big data principles and paradigm +In Ocean Energy Modeling and Simulation with Big Data, 2020 +> Read full chapter +Extreme Learning Machine and Its Applications +in Big Data Processing +In Big Data Analytics for Sensor-Network Collected Intelligence, 2017 +> Read full chapter +Energy Efficiency in Data Centers and +Clouds +In Advances in 
Computers, 2016 +> Read full chapter +Climate Analytics as a Service +In Cloud Computing in Ocean and Atmospheric Sciences, 2016 +> Read full chapter +A Deep Dive into NoSQL Databases: +The Use Cases and Applications +In Advances in Computers, 2018 +> Read full chapter +Hadoop in the Cloud to Analyze Climate +Datasets +In Cloud Computing in Ocean and Atmospheric Sciences, 2016 +> Read full chapter +Ocean energy data learning from big +data +In Ocean Energy Modeling and Simulation with Big Data, 2020 +> Read full chapter +Connected Computing Environment +In Advances in Computers, 2013 +> Read full chapter +ScienceDirect is Elsevier’s leading information solution for researchers. +Copyright © 2018 Elsevier B.V. or its licensors or contributors. ScienceDirect ® is a registered trademark of Elsevier B.V. Terms and conditions apply. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..f9484f874a6ec1ea9b327478269900ead4bca34d Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Big Data Management on Wireless Sensor Networks.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0b29020667b0edc30547955405aa2147b0bef79 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt @@ -0,0 +1,469 @@ +MapReduce is a framework using which we can write applications to process huge amounts of data, in parallel, on large clusters of commodity hardware in a reliable manner. + +What is MapReduce? +MapReduce is a processing technique and a program model for distributed computing based on java. The MapReduce algorithm contains two important tasks, namely Map and Reduce. Map takes a set of data and converts it into another set of data, where individual elements are broken down into tuples (key/value pairs). Secondly, reduce task, which takes the output from a map as an input and combines those data tuples into a smaller set of tuples. As the sequence of the name MapReduce implies, the reduce task is always performed after the map job. + +The major advantage of MapReduce is that it is easy to scale data processing over multiple computing nodes. Under the MapReduce model, the data processing primitives are called mappers and reducers. Decomposing a data processing application into mappers and reducers is sometimes nontrivial. But, once we write an application in the MapReduce form, scaling the application to run over hundreds, thousands, or even tens of thousands of machines in a cluster is merely a configuration change. This simple scalability is what has attracted many programmers to use the MapReduce model. + +The Algorithm +Generally MapReduce paradigm is based on sending the computer to where the data resides! + +MapReduce program executes in three stages, namely map stage, shuffle stage, and reduce stage. + +Map stage − The map or mapper’s job is to process the input data. Generally the input data is in the form of file or directory and is stored in the Hadoop file system (HDFS). The input file is passed to the mapper function line by line. 
The mapper processes the data and creates several small chunks of data. + +Reduce stage − This stage is the combination of the Shuffle stage and the Reduce stage. The Reducer’s job is to process the data that comes from the mapper. After processing, it produces a new set of output, which will be stored in the HDFS. + +During a MapReduce job, Hadoop sends the Map and Reduce tasks to the appropriate servers in the cluster. + +The framework manages all the details of data-passing such as issuing tasks, verifying task completion, and copying data around the cluster between the nodes. + +Most of the computing takes place on nodes with data on local disks, which reduces the network traffic. + +After completion of the given tasks, the cluster collects and reduces the data to form an appropriate result, and sends it back to the Hadoop server. + +MapReduce Algorithm +Inputs and Outputs (Java Perspective) +The MapReduce framework operates on <key, value> pairs, that is, the framework views the input to the job as a set of <key, value> pairs and produces a set of <key, value> pairs as the output of the job, conceivably of different types. + +The key and the value classes have to be serializable by the framework and hence need to implement the Writable interface. Additionally, the key classes have to implement the WritableComparable interface to facilitate sorting by the framework. Input and Output types of a MapReduce job − (Input) <k1, v1> → map → <k2, v2> → reduce → <k3, v3> (Output).
+Input Output
+Map <k1, v1> list (<k2, v2>)
+Reduce <k2, list(v2)> list (<k3, v3>)
+Terminology +PayLoad − Applications implement the Map and the Reduce functions, and form the core of the job. + +Mapper − Mapper maps the input key/value pairs to a set of intermediate key/value pairs. + +NamedNode − Node that manages the Hadoop Distributed File System (HDFS). + +DataNode − Node where data is presented in advance before any processing takes place. + +MasterNode − Node where JobTracker runs and which accepts job requests from clients. + +SlaveNode − Node where the Map and Reduce programs run. + +JobTracker − Schedules jobs and tracks the assigned jobs to the Task Tracker. + +Task Tracker − Tracks the task and reports status to JobTracker. + +Job − A program is an execution of a Mapper and Reducer across a dataset. + +Task − An execution of a Mapper or a Reducer on a slice of data. + +Task Attempt − A particular instance of an attempt to execute a task on a SlaveNode. + +Example Scenario +Given below is the data regarding the electrical consumption of an organization. It contains the monthly electrical consumption and the annual average for various years.
+Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec Avg
+1979 23 23 2 43 24 25 26 26 26 26 25 26 25
+1980 26 27 28 28 28 30 31 31 31 30 30 30 29
+1981 31 32 32 32 33 34 35 36 36 34 34 34 34
+1984 39 38 39 39 39 41 42 43 40 39 38 38 40
+1985 38 39 39 39 39 41 41 41 00 40 39 39 45
+If the above data is given as input, we have to write applications to process it and produce results such as finding the year of maximum usage, the year of minimum usage, and so on. This is a walkover for programmers with a finite number of records. They will simply write the logic to produce the required output, and pass the data to the application written. + +But, think of the data representing the electrical consumption of all the large-scale industries of a particular state, since its formation. + +When we write applications to process such bulk data, + +they will take a lot of time to execute, and + +there will be heavy network traffic when we move data from source to network server and so on. + +To solve these problems, we have the MapReduce framework.
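+As a side note to the "Inputs and Outputs (Java Perspective)" section above, the Writable / WritableComparable requirement can be illustrated with a minimal custom key type. This is an illustrative sketch, not part of the tutorial; the class name YearMonthKey is hypothetical.
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.WritableComparable;
+
+// A composite key holding a year and a month. It can be used as a MapReduce key
+// because it is serializable by the framework (write/readFields) and sortable (compareTo).
+public class YearMonthKey implements WritableComparable<YearMonthKey> {
+   private int year;
+   private int month;
+
+   public YearMonthKey() { }                    // no-arg constructor required by the framework
+
+   public YearMonthKey(int year, int month) {
+      this.year = year;
+      this.month = month;
+   }
+
+   public void write(DataOutput out) throws IOException {
+      out.writeInt(year);
+      out.writeInt(month);
+   }
+
+   public void readFields(DataInput in) throws IOException {
+      year = in.readInt();
+      month = in.readInt();
+   }
+
+   public int compareTo(YearMonthKey other) {
+      int cmp = Integer.compare(year, other.year);
+      return cmp != 0 ? cmp : Integer.compare(month, other.month);
+   }
+}
+In practice such a key would also override hashCode() and equals() so that the default hash partitioner distributes the keys sensibly.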
+ +Input Data +The above data is saved as sample.txt and given as input. The input file looks as shown below.
+1979 23 23 2 43 24 25 26 26 26 26 25 26 25
+1980 26 27 28 28 28 30 31 31 31 30 30 30 29
+1981 31 32 32 32 33 34 35 36 36 34 34 34 34
+1984 39 38 39 39 39 41 42 43 40 39 38 38 40
+1985 38 39 39 39 39 41 41 41 00 40 39 39 45
+Example Program +Given below is the program to process the sample data using the MapReduce framework.
+package hadoop;
+
+import java.util.*;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+
+public class ProcessUnits {
+   //Mapper class
+   public static class E_EMapper extends MapReduceBase implements
+   Mapper<LongWritable, /*Input key Type*/
+   Text,                /*Input value Type*/
+   Text,                /*Output key Type*/
+   IntWritable>         /*Output value Type*/
+   {
+      //Map function
+      public void map(LongWritable key, Text value,
+      OutputCollector<Text, IntWritable> output,
+      Reporter reporter) throws IOException {
+         String line = value.toString();
+         String lasttoken = null;
+         StringTokenizer s = new StringTokenizer(line,"\t");
+         String year = s.nextToken();
+
+         while(s.hasMoreTokens()) {
+            lasttoken = s.nextToken();
+         }
+         // the last column of each record is the yearly average
+         int avgprice = Integer.parseInt(lasttoken);
+         output.collect(new Text(year), new IntWritable(avgprice));
+      }
+   }
+
+   //Reducer class
+   public static class E_EReduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
+
+      //Reduce function
+      public void reduce(Text key, Iterator<IntWritable> values,
+      OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
+         int maxavg = 30;
+         int val = Integer.MIN_VALUE;
+
+         // emit every value above the threshold maxavg for this year
+         while (values.hasNext()) {
+            if((val = values.next().get())>maxavg) {
+               output.collect(key, new IntWritable(val));
+            }
+         }
+      }
+   }
+
+   //Main function
+   public static void main(String args[])throws Exception {
+      JobConf conf = new JobConf(ProcessUnits.class);
+
+      conf.setJobName("max_eletricityunits");
+      conf.setOutputKeyClass(Text.class);
+      conf.setOutputValueClass(IntWritable.class);
+      conf.setMapperClass(E_EMapper.class);
+      conf.setCombinerClass(E_EReduce.class);
+      conf.setReducerClass(E_EReduce.class);
+      conf.setInputFormat(TextInputFormat.class);
+      conf.setOutputFormat(TextOutputFormat.class);
+
+      FileInputFormat.setInputPaths(conf, new Path(args[0]));
+      FileOutputFormat.setOutputPath(conf, new Path(args[1]));
+
+      JobClient.runJob(conf);
+   }
+}
+Save the above program as ProcessUnits.java. The compilation and execution of the program are explained below. + +Compilation and Execution of Process Units Program +Let us assume we are in the home directory of a Hadoop user (e.g. /home/hadoop). + +Follow the steps given below to compile and execute the above program. + +Step 1 +The following command is used to create a directory to store the compiled java classes. + +$ mkdir units +Step 2 +Download Hadoop-core-1.2.1.jar, which is used to compile and execute the MapReduce program. Visit the following link mvnrepository.com to download the jar. Let us assume the downloaded folder is /home/hadoop/. + +Step 3 +The following commands are used for compiling the ProcessUnits.java program and creating a jar for the program.
+$ javac -classpath hadoop-core-1.2.1.jar -d units ProcessUnits.java
+$ jar -cvf units.jar -C units/ .
+Step 4 +The following command is used to create an input directory in HDFS.
+ +$HADOOP_HOME/bin/hadoop fs -mkdir input_dir +Step 5 +The following command is used to copy the input file named sample.txtin the input directory of HDFS. + +$HADOOP_HOME/bin/hadoop fs -put /home/hadoop/sample.txt input_dir +Step 6 +The following command is used to verify the files in the input directory. + +$HADOOP_HOME/bin/hadoop fs -ls input_dir/ +Step 7 +The following command is used to run the Eleunit_max application by taking the input files from the input directory. + +$HADOOP_HOME/bin/hadoop jar units.jar hadoop.ProcessUnits input_dir output_dir +Wait for a while until the file is executed. After execution, as shown below, the output will contain the number of input splits, the number of Map tasks, the number of reducer tasks, etc. + +INFO mapreduce.Job: Job job_1414748220717_0002 +completed successfully +14/10/31 06:02:52 +INFO mapreduce.Job: Counters: 49 + File System Counters + +FILE: Number of bytes read = 61 +FILE: Number of bytes written = 279400 +FILE: Number of read operations = 0 +FILE: Number of large read operations = 0 +FILE: Number of write operations = 0 +HDFS: Number of bytes read = 546 +HDFS: Number of bytes written = 40 +HDFS: Number of read operations = 9 +HDFS: Number of large read operations = 0 +HDFS: Number of write operations = 2 Job Counters + + + Launched map tasks = 2 + Launched reduce tasks = 1 + Data-local map tasks = 2 + Total time spent by all maps in occupied slots (ms) = 146137 + Total time spent by all reduces in occupied slots (ms) = 441 + Total time spent by all map tasks (ms) = 14613 + Total time spent by all reduce tasks (ms) = 44120 + Total vcore-seconds taken by all map tasks = 146137 + Total vcore-seconds taken by all reduce tasks = 44120 + Total megabyte-seconds taken by all map tasks = 149644288 + Total megabyte-seconds taken by all reduce tasks = 45178880 + +Map-Reduce Framework + + Map input records = 5 + Map output records = 5 + Map output bytes = 45 + Map output materialized bytes = 67 + Input split bytes = 208 + Combine input records = 5 + Combine output records = 5 + Reduce input groups = 5 + Reduce shuffle bytes = 6 + Reduce input records = 5 + Reduce output records = 5 + Spilled Records = 10 + Shuffled Maps = 2 + Failed Shuffles = 0 + Merged Map outputs = 2 + GC time elapsed (ms) = 948 + CPU time spent (ms) = 5160 + Physical memory (bytes) snapshot = 47749120 + Virtual memory (bytes) snapshot = 2899349504 + Total committed heap usage (bytes) = 277684224 + +File Output Format Counters + + Bytes Written = 40 +Step 8 +The following command is used to verify the resultant files in the output folder. + +$HADOOP_HOME/bin/hadoop fs -ls output_dir/ +Step 9 +The following command is used to see the output in Part-00000 file. This file is generated by HDFS. + +$HADOOP_HOME/bin/hadoop fs -cat output_dir/part-00000 +Below is the output generated by the MapReduce program. + +1981 34 +1984 40 +1985 45 +Step 10 +The following command is used to copy the output folder from HDFS to the local file system for analyzing. + +$HADOOP_HOME/bin/hadoop fs -cat output_dir/part-00000/bin/hadoop dfs get output_dir /home/hadoop +Important Commands +All Hadoop commands are invoked by the $HADOOP_HOME/bin/hadoop command. Running the Hadoop script without any arguments prints the description for all commands. + +Usage − hadoop [--config confdir] COMMAND + +The following table lists the options available and their description. + +Sr.No. Option & Description +1 +namenode -format + +Formats the DFS filesystem. 
+ +2 +secondarynamenode + +Runs the DFS secondary namenode. + +3 +namenode + +Runs the DFS namenode. + +4 +datanode + +Runs a DFS datanode. + +5 +dfsadmin + +Runs a DFS admin client. + +6 +mradmin + +Runs a Map-Reduce admin client. + +7 +fsck + +Runs a DFS filesystem checking utility. + +8 +fs + +Runs a generic filesystem user client. + +9 +balancer + +Runs a cluster balancing utility. + +10 +oiv + +Applies the offline fsimage viewer to an fsimage. + +11 +fetchdt + +Fetches a delegation token from the NameNode. + +12 +jobtracker + +Runs the MapReduce job Tracker node. + +13 +pipes + +Runs a Pipes job. + +14 +tasktracker + +Runs a MapReduce task Tracker node. + +15 +historyserver + +Runs job history servers as a standalone daemon. + +16 +job + +Manipulates the MapReduce jobs. + +17 +queue + +Gets information regarding JobQueues. + +18 +version + +Prints the version. + +19 +jar + +Runs a jar file. + +20 +distcp + +Copies file or directories recursively. + +21 +distcp2 + +DistCp version 2. + +22 +archive -archiveName NAME -p * + +Creates a hadoop archive. + +23 +classpath + +Prints the class path needed to get the Hadoop jar and the required libraries. + +24 +daemonlog + +Get/Set the log level for each daemon + +How to Interact with MapReduce Jobs +Usage − hadoop job [GENERIC_OPTIONS] + +The following are the Generic Options available in a Hadoop job. + +Sr.No. GENERIC_OPTION & Description +1 +-submit + +Submits the job. + +2 +-status + +Prints the map and reduce completion percentage and all job counters. + +3 +-counter + +Prints the counter value. + +4 +-kill + +Kills the job. + +5 +-events <#-of-events> + +Prints the events' details received by jobtracker for the given range. + +6 +-history [all] - history < jobOutputDir> + +Prints job details, failed and killed tip details. More details about the job such as successful tasks and task attempts made for each task can be viewed by specifying the [all] option. + +7 +-list[all] + +Displays all jobs. -list displays only jobs which are yet to complete. + +8 +-kill-task + +Kills the task. Killed tasks are NOT counted against failed attempts. + +9 +-fail-task + +Fails the task. Failed tasks are counted against failed attempts. + +10 +-set-priority + +Changes the priority of the job. Allowed priority values are VERY_HIGH, HIGH, NORMAL, LOW, VERY_LOW + +To see the status of job +$ $HADOOP_HOME/bin/hadoop job -status +e.g. +$ $HADOOP_HOME/bin/hadoop job -status job_201310191043_0004 +To see the history of job output-dir +$ $HADOOP_HOME/bin/hadoop job -history +e.g. +$ $HADOOP_HOME/bin/hadoop job -history /user/expert/output +To kill the job +$ $HADOOP_HOME/bin/hadoop job -kill +e.g. 
+$ $HADOOP_HOME/bin/hadoop job -kill job_201310191043_0004 \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..412406ce0d8413362275e906413495ce1ebadd88 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop - MapReduce.txt.xml.xls differ diff --git "a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt" "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt" new file mode 100644 index 0000000000000000000000000000000000000000..280390eb0afe3f18e843905c8d1d94dcb004b05e --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt" @@ -0,0 +1,161 @@ +Hadoop Architecture in Detail – HDFS, Yarn & MapReduce +Boost your career with Big Data Get Exclusive Offers on Big Data Course!! +Hadoop now has become a popular solution for today’s world needs. The design of Hadoop keeps various goals in mind. These are fault tolerance, handling of large datasets, data locality, portability across heterogeneous hardware and software platforms etc. In this blog, we will explore the Hadoop Architecture in detail. Also, we will see Hadoop Architecture Diagram that helps you to understand it better. + +So, let’s explore Hadoop Architecture. + +Hadoop Architecture in Detail - HDFS, Yarn & MapReduce + +What is Hadoop Architecture? +Hadoop has a master-slave topology. In this topology, we have one master node and multiple slave nodes. Master node’s function is to assign a task to various slave nodes and manage resources. The slave nodes do the actual computing. Slave nodes store the real data whereas on master we have metadata. This means it stores data about data. What does metadata comprise that we will see in a moment? + +Hadoop Application Architecture in Detail + +Hadoop Architecture comprises three major layers. They are:- + +HDFS (Hadoop Distributed File System) +Yarn +MapReduce +1. HDFS +HDFS stands for Hadoop Distributed File System. It provides for data storage of Hadoop. HDFS splits the data unit into smaller units called blocks and stores them in a distributed manner. It has got two daemons running. One for master node – NameNode and other for slave nodes – DataNode. + +a. NameNode and DataNode +HDFS has a Master-slave architecture. The daemon called NameNode runs on the master server. It is responsible for Namespace management and regulates file access by the client. DataNode daemon runs on slave nodes. It is responsible for storing actual business data. Internally, a file gets split into a number of data blocks and stored on a group of slave machines. Namenode manages modifications to file system namespace. These are actions like the opening, closing and renaming files or directories. NameNode also keeps track of mapping of blocks to DataNodes. This DataNodes serves read/write request from the file system’s client. DataNode also creates, deletes and replicates blocks on demand from NameNode. + + + +Hadoop Architecture Diagram + +Java is the native language of HDFS. Hence one can deploy DataNode and NameNode on machines having Java installed. In a typical deployment, there is one dedicated machine running NameNode. 
And all the other nodes in the cluster run DataNode. The NameNode contains metadata like the location of blocks on the DataNodes. And arbitrates resources among various competing DataNodes. + +You must read about Hadoop High Availability Concept + +b. Block in HDFS +Block is nothing but the smallest unit of storage on a computer system. It is the smallest contiguous storage allocated to a file. In Hadoop, we have a default block size of 128MB or 256 MB. + +Hadoop Architecture Diagram + +One should select the block size very carefully. To explain why so let us take an example of a file which is 700MB in size. If our block size is 128MB then HDFS divides the file into 6 blocks. Five blocks of 128MB and one block of 60MB. What will happen if the block is of size 4KB? But in HDFS we would be having files of size in the order terabytes to petabytes. With 4KB of the block size, we would be having numerous blocks. This, in turn, will create huge metadata which will overload the NameNode. Hence we have to choose our HDFS block size judiciously. + +c. Replication Management +To provide fault tolerance HDFS uses a replication technique. In that, it makes copies of the blocks and stores in on different DataNodes. Replication factor decides how many copies of the blocks get stored. It is 3 by default but we can configure to any value. + +Hadoop Replication Factor + +The above figure shows how the replication technique works. Suppose we have a file of 1GB then with a replication factor of 3 it will require 3GBs of total storage. + +To maintain the replication factor NameNode collects block report from every DataNode. Whenever a block is under-replicated or over-replicated the NameNode adds or deletes the replicas accordingly. + +d. What is Rack Awareness? +Hadoop Architecture + +A rack contains many DataNode machines and there are several such racks in the production. HDFS follows a rack awareness algorithm to place the replicas of the blocks in a distributed fashion. This rack awareness algorithm provides for low latency and fault tolerance. Suppose the replication factor configured is 3. Now rack awareness algorithm will place the first block on a local rack. It will keep the other two blocks on a different rack. It does not store more than two blocks in the same rack if possible. + +2. MapReduce +MapReduce is the data processing layer of Hadoop. It is a software framework that allows you to write applications for processing a large amount of data. MapReduce runs these applications in parallel on a cluster of low-end machines. It does so in a reliable and fault-tolerant manner. + +MapReduce job comprises a number of map tasks and reduces tasks. Each task works on a part of data. This distributes the load across the cluster. The function of Map tasks is to load, parse, transform and filter data. Each reduce task works on the sub-set of output from the map tasks. Reduce task applies grouping and aggregation to this intermediate data from the map tasks. + +The input file for the MapReduce job exists on HDFS. The inputformat decides how to split the input file into input splits. Input split is nothing but a byte-oriented view of the chunk of the input file. This input split gets loaded by the map task. The map task runs on the node where the relevant data is present. The data need not move over the network and get processed locally. + +Hadoop Architecture - MapReduce + +i. Map Task +The Map task run in the following phases:- + +a. RecordReader +The recordreader transforms the input split into records. 
It parses the data into records but does not parse records itself. It provides the data to the mapper function in key-value pairs. Usually, the key is the positional information and value is the data that comprises the record. + +b. Map +In this phase, the mapper which is the user-defined function processes the key-value pair from the recordreader. It produces zero or multiple intermediate key-value pairs. + +The decision of what will be the key-value pair lies on the mapper function. The key is usually the data on which the reducer function does the grouping operation. And value is the data which gets aggregated to get the final result in the reducer function. + +c. Combiner +The combiner is actually a localized reducer which groups the data in the map phase. It is optional. Combiner takes the intermediate data from the mapper and aggregates them. It does so within the small scope of one mapper. In many situations, this decreases the amount of data needed to move over the network. For example, moving (Hello World, 1) three times consumes more network bandwidth than moving (Hello World, 3). Combiner provides extreme performance gain with no drawbacks. The combiner is not guaranteed to execute. Hence it is not of overall algorithm. + +d. Partitioner +Partitioner pulls the intermediate key-value pairs from the mapper. It splits them into shards, one shard per reducer. By default, partitioner fetches the hashcode of the key. The partitioner performs modulus operation by a number of reducers: key.hashcode()%(number of reducers). This distributes the keyspace evenly over the reducers. It also ensures that key with the same value but from different mappers end up into the same reducer. The partitioned data gets written on the local file system from each map task. It waits there so that reducer can pull it. + +b. Reduce Task +The various phases in reduce task are as follows: + +i. Shuffle and Sort +The reducer starts with shuffle and sort step. This step downloads the data written by partitioner to the machine where reducer is running. This step sorts the individual data pieces into a large data list. The purpose of this sort is to collect the equivalent keys together. The framework does this so that we could iterate over it easily in the reduce task. This phase is not customizable. The framework handles everything automatically. However, the developer has control over how the keys get sorted and grouped through a comparator object. + +ii. Reduce +The reducer performs the reduce function once per key grouping. The framework passes the function key and an iterator object containing all the values pertaining to the key. + +We can write reducer to filter, aggregate and combine data in a number of different ways. Once the reduce function gets finished it gives zero or more key-value pairs to the outputformat. Like map function, reduce function changes from job to job. As it is the core logic of the solution. + +iii. OutputFormat +This is the final step. It takes the key-value pair from the reducer and writes it to the file by recordwriter. By default, it separates the key and value by a tab and each record by a newline character. We can customize it to provide richer output format. But none the less final data gets written to HDFS. + +Hadoop MapReduce Architecture Diagram + +3. YARN +YARN or Yet Another Resource Negotiator is the resource management layer of Hadoop. The basic principle behind YARN is to separate resource management and job scheduling/monitoring function into separate daemons. 
In YARN there is one global ResourceManager and per-application ApplicationMaster. An Application can be a single job or a DAG of jobs. + +Inside the YARN framework, we have two daemons ResourceManager and NodeManager. The ResourceManager arbitrates resources among all the competing applications in the system. The job of NodeManger is to monitor the resource usage by the container and report the same to ResourceManger. The resources are like CPU, memory, disk, network and so on. + +The ApplcationMaster negotiates resources with ResourceManager and works with NodeManger to execute and monitor the job. + +Hadoop Architecture + +The ResourceManger has two important components – Scheduler and ApplicationManager + +i. Scheduler +Scheduler is responsible for allocating resources to various applications. This is a pure scheduler as it does not perform tracking of status for the application. It also does not reschedule the tasks which fail due to software or hardware errors. The scheduler allocates the resources based on the requirements of the applications. + +ii. Application Manager +Following are the functions of ApplicationManager + +Accepts job submission. +Negotiates the first container for executing ApplicationMaster. A container incorporates elements such as CPU, memory, disk, and network. +Restarts the ApplicationMaster container on failure. +Functions of ApplicationMaster:- + +Negotiates resource container from Scheduler. +Tracks the resource container status. +Monitors progress of the application. +We can scale the YARN beyond a few thousand nodes through YARN Federation feature. This feature enables us to tie multiple YARN clusters into a single massive cluster. This allows for using independent clusters, clubbed together for a very large job. + +iii. Features of Yarn +YARN has the following features:- + +a. Multi-tenancy + +YARN allows a variety of access engines (open-source or propriety) on the same Hadoop data set. These access engines can be of batch processing, real-time processing, iterative processing and so on. + +b. Cluster Utilization + +With the dynamic allocation of resources, YARN allows for good use of the cluster. As compared to static map-reduce rules in previous versions of Hadoop which provides lesser utilization of the cluster. + +c. Scalability + +Any data center processing power keeps on expanding. YARN’s ResourceManager focuses on scheduling and copes with the ever-expanding cluster, processing petabytes of data. + +d. Compatibility + +MapReduce program developed for Hadoop 1.x can still on this YARN. And this is without any disruption to processes that already work. + +Best Practices For Hadoop Architecture Design +i. Embrace Redundancy Use Commodity Hardware +Many companies venture into Hadoop by business users or analytics group. The infrastructure folks peach in later. These people often have no idea about Hadoop. The result is the over-sized cluster which increases the budget many folds. Hadoop was mainly created for availing cheap storage and deep data analysis. To achieve this use JBOD i.e. Just a Bunch Of Disk. Also, use a single power supply. + +ii. Start Small and Keep Focus +Many projects fail because of their complexity and expense. To avoid this start with a small cluster of nodes and add nodes as you go along. Start with a small project so that infrastructure and development guys can understand the internal working of Hadoop. + +iii. Create Procedure For Data Integration +One of the features of Hadoop is that it allows dumping the data first. 
And we can define the data structure later. We can get data easily with tools such as Flume and Sqoop. But it is essential to create a data integration process. This includes various layers such as staging, naming standards, location etc. Make proper documentation of data sources and where they live in the cluster. + +iv. Use Compression Technique +Enterprise has a love-hate relationship with compression. There is a trade-off between performance and storage. Although compression decreases the storage used it decreases the performance too. But Hadoop thrives on compression. It can increase storage usage by 80%. + +v. Create Multiple Environments +It is a best practice to build multiple environments for development, testing, and production. As Apache Hadoop has a wide ecosystem, different projects in it have different requirements. Hence there is a need for a non-production environment for testing upgrades and new functionalities. + +Summary +Hence, in this Hadoop Application Architecture, we saw the design of Hadoop Architecture is such that it recovers itself whenever needed. Its redundant storage structure makes it fault-tolerant and robust. We are able to scale the system linearly. The MapReduce part of the design works on the principle of data locality. The Map-Reduce framework moves the computation close to the data. Therefore decreasing network traffic which would otherwise have consumed major bandwidth for moving large datasets. Thus overall architecture of Hadoop makes it economical, scalable and efficient big data technology. + +Hadoop Architecture is a very important topic for your Hadoop Interview. We recommend you to once check most asked Hadoop Interview questions. You will get many questions from Hadoop Architecture. \ No newline at end of file diff --git "a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt.xml.xls" "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt.xml.xls" new file mode 100644 index 0000000000000000000000000000000000000000..7c6456bafba575d9246b014e810f8c2d5bd50418 Binary files /dev/null and "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop Architecture in Detail \342\200\223 HDFS, Yarn & MapReduce.txt.xml.xls" differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt new file mode 100644 index 0000000000000000000000000000000000000000..de93cfefd6ca4e0ff0946228aeae39ee290e4d71 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt @@ -0,0 +1,21 @@ +MapReduce +Hadoop MapReduce- Java-based Processing Framework for Big Data +MapReduce rules the roost for massive scale big data processing on Hadoop. The highest unit of work in Hadoop MapReduce is a Job. MapReduce programming paradigm uses a two-step data analysis process- Map Stage and Reduce Stage (reduce phase is optional). The map stage takes a set of data and converts it into another set where data elements are broken down into key-value pairs or tuples. Reduce job takes the output of the map function and combines them into smaller set of tuples or key-value pairs. The reduce job is always performed when the map job is completed - hence the sequence of the name MapReduce. 
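+The two-stage paradigm described above is easiest to see in the classic word-count job. The following is a minimal, illustrative sketch (not part of the original text; class names such as WordCount are hypothetical) using the org.apache.hadoop.mapreduce API: the map stage emits (word, 1) for every token, and the reduce stage sums the counts per word.
+import java.io.IOException;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+
+public class WordCount {
+   public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
+      private static final IntWritable ONE = new IntWritable(1);
+      private final Text word = new Text();
+
+      @Override
+      protected void map(LongWritable key, Text value, Context context)
+            throws IOException, InterruptedException {
+         StringTokenizer itr = new StringTokenizer(value.toString());
+         while (itr.hasMoreTokens()) {
+            word.set(itr.nextToken());
+            context.write(word, ONE);      // map stage: emit (word, 1) for every token
+         }
+      }
+   }
+
+   public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
+      @Override
+      protected void reduce(Text key, Iterable<IntWritable> values, Context context)
+            throws IOException, InterruptedException {
+         int sum = 0;
+         for (IntWritable v : values) {
+            sum += v.get();                // reduce stage: combine the per-word counts
+         }
+         context.write(key, new IntWritable(sum));
+      }
+   }
+}
+The job driver (creating a Job, setting the mapper, reducer, input and output paths) is omitted; the same reducer class could also be registered as a combiner to shrink the intermediate data before the shuffle.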
+ +MapReduce Overview +MapReduce Terminologies +MapReduce Life Cycle +MapReduce Advantages +MapReduce Blogs +MapReduce Tutorials +MapReduce Interview Questions +MapReduce Slides +MapReduce Videos +MapReduce Questions & Answers +MapReduce Assignments + +MapReduce Terminologies +Job - It is the complete process to execute including the mappers, the input, the reducers and the output across a particular dataset. +Task - Every job is divided into several mappers and reducers. A portion of the job executed on a slice of data can be referred to as a task. +JobTracker - It is the master node for managing all the jobs and resources in a hadoop cluster. +TaskTracker - These are the agents deployed to each machine in the hadoop cluster to run Map and Reduce tasks and then report the status to the JobTracker after execution. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..d4157162478f4d119ce961fc0682e28d56797baa Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Hadoop MapReduce- Java-based Processing Framework for Big Data.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt new file mode 100644 index 0000000000000000000000000000000000000000..c08e74f85bc6402cf53cb874c6a6c2c095431aeb --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt @@ -0,0 +1,48 @@ +Introduction to MapReduce Architecture +Hadoop cluster stores a large set of data which is parallelly processed mainly by MapReduce. Firstly, it was just a thesis that Google designed. That provides parallelism, fault-tolerance, and data distribution. For processing huge chunks of data, MapReduce comes into the picture. Map Reduce provides API with features such as parallel processing of huge amounts of data, batch processing, and high availability. Map Reduce programs are written by programmers when there is a need for an application for business scenarios. The development of applications and deployment across Hadoop clusters is done by the programmers when they understand the flow pattern of MapReduce. + +Explanation of MapReduce Architecture +Hadoop can be developed in programming languages like Python and C++. MapReduce Hadoop is a software framework for ease in writing applications of software processing huge amounts of data. MapReduce is a framework which splits the chunk of data, sorts the map outputs and input to reduce tasks. A File-system stores the work and input of jobs. Re-execution of failed tasks, scheduling them, and monitoring them is the task of the framework. + +The architecture of MapReduce basically has two main processing stages, and those are Map and Reduce. The MapReduce happens in Job tracker. Intermediate processes will take place in between the Map and Reduce phases. Sort and shuffle are the tasks taken up by Map and Reduce, which are done intermediate. The local file system stores the intermediate data. + +Map() Function: Create and process the import data. 
Takes in data, converts it into a set of other data where the breakdown of individual elements into tuples is done—no API contract requiring a certain number of outputs. +Reduce() Function: Mappers output is passed into the reduction. Processes the data into something usable. Every single mapper is passed into the reduced function. The new output values are saved into HDFS. +MapReduce Architecture Components +Below is the explanation of components of MapReduce architecture: + +MapReduce1 +1. Map Phase +Map phase splits the input data into two parts. They are Keys and Values. Writable and comparable is the key in the processing stage where only in the processing stage, Value is writable. Let’s say a client gives input data to a Hadoop system; task tracker is assigned tasks by job tracker. Splitting of input is done into several inputs. Key-value pair conversion is done with the input data by the record reader. This is the actual data input for Map as in mapped information for further processing. The format type varies, so the coder has to look into each piece of data format and code accordingly. + +Mini reducer which is commonly called a combiner, the reducer code places input as the combiner. Network bandwidth is high when a huge amount of data is required. Hash is the default partition used. The partition module plays a key role in Hadoop. More performance is given by reducing the pressure by petitioner on the reducer. + +2. Processing in Intermediate +In the intermediate phase, the map input gets into the sort and shuffle phase. Hadoop nodes do not have replications where all the intermediate data is stored in a local file system. Round – robin data is used by Hadoop to write to local disk, the intermediate data. There are other shuffles and sort factors to be considered to reach the condition of writing the data to local disks. + +3. Reducer Phase +The reducer takes in the data input that is sorted and shuffled. All the input data will be combined, and similar key-value pairs are to be written to the hdfs system. For searching and mapping purposes, a reducer is not always necessary. Setting some properties for enabling to develop of the number of reducers for each task. During job processing, speculative execution plays a prominent role. The performance is FIFO that is first in first out, and if more than one mapper is working on similar data, and if one is running slow, then the tasks are assigned to the next mapper for a fast program run by the job tracker. + + Popular Course in this category +MapReduce Training (2 Courses, 4+ Projects) +2 Online Courses | 4 Hands-on Projects | 19+ Hours | Verifiable Certificate of Completion | Lifetime Access +4.5 (6,176 ratings)Course Price +$79 $399 +View Course +Related Courses +Data Scientist Training (76 Courses, 60+ Projects)Machine Learning Training (17 Courses, 27+ Projects)Hadoop Training Program (20 Courses, 14+ Projects, 4 Quizzes) +MapReduce2 +This is how MapReduce organizers work. + +The job is divided into two components: Map tasks (Splits and mapping) and Reduce tasks (Reducing and shuffling). +The above picture says that Job tracker is associated with complete execution of a given job, by behaving like a master. Whereas, the Multiple task trackers act like slaves by performing the job each. +Conclusion +Imagine you have lots of documents, which is huge data. And you need to count the number of occurrences of each word throughout the documents. 
I might seem like an arbitrary task, but the basic idea is that let’s say you have a lot of web pages and you want to make them available for search queries. The reducer does aggregation of data, and it consists of all the keys and combines them all for similar key-value pairs which is basically the Hadoop shuffling process. + +Recommended Articles +This is a guide to MapReduce Architecture. Here we discuss an introduction to MapReduce Architecture, explanation of components of the architecture in detail. You can also go through our other related articles to learn more – + +Mapreduce Combiner +How MapReduce Works +What is MapReduce in Hadoop? +MapReduce Algorithms diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..baa4ea40dea9c37485d5dd346501ceb9d73e882c Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture1.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt new file mode 100644 index 0000000000000000000000000000000000000000..c29fcc12b99fe2dbf843cee6167124c5029762d0 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt @@ -0,0 +1,72 @@ +MapReduce is a programming model and expectation is parallel processing in Hadoop. MapReduce makes easy to distribute tasks across nodes and performs Sort or Merge based on distributed computing. + +The underlying system takes care of partitioning input data, scheduling the programs execution across several machines, handling machine failures and managing inter-machine communication. + +Input will be divided into multiple chunks/blocks. Each and every chunk/block of data will be processed in different nodes. MapReduce architecture contains the below phases - + +Input Files +InputFormat +InputSplit +RecordReader +Mapper +Combiner +Partitioner +Shuffling and Sorting +Reducer +RecordWriter +OutputFormat +Input Files - +In general, the input data to process using MapReduce task is stored in input files. These input files typically reside in HDFS (Hadoop Distributed File System). The format of these files is random where other formats like binary or log files can also be used. + +InputFormat - +InputFormat describes the input-specification for a Map-Reduce job. InputFormat defines how the input files are to split and read. InputFormat selects the files or other objects used for input. + +InputFormat creates InputSplit from the selected input files. InputFormat split the input into logical InputSplits based on the total size, in bytes of the input files. + +InputSplit - +InputSplit is created by InputFormat. InputSplit logically represents the data to be processed by an individual Mapper. One map task is created to process one InputSplit. + +The number of map tasks normally equals to the number of InputSplits. The InputSplit is divided into input records and each record is processed by the specific mapper assigned to process the InputSplit. InputSplit presents a byte-oriented view on the input. + +RecordReader - +RecordReader communicates with the InputSplit in Hadoop MapReduce. RecordReader reads pairs from an InputSplit. RecordReader converts the byte-oriented view of the input from the InputSplit. 
+ +RecordReader provides a record-oriented view of the input data for mapper and reducer tasks processing. RecordReader converts the data into key-value pairs suitable for reading by the mapper. + +RecordReader communicates with the InputSplit until the file reading is not completed. Once the file reading completed, these key-value pairs are sent to the mapper for further processing. + +Mapper - +Mapper processes each input record and generates new key-value pair. Mapper generated key-value pair is completely different from the input key-value pair. The mapper output is called as intermediate output. + +The mapper output is not written to local disk because of it creates unnecessary copies. Mappers output is passed to the combiner for further process. + +Map takes a set of data and converts it into another set of data, where individual elements are broken down into key pairs. The Mapper reads the data in the form of key/value pairs and outputs zero or more key/value pairs. + +Combiner - +Combiner acts as a mini reducer in MapReduce framework. This is an optional class provided in MapReduce driver class. Combiner process the output of map tasks and sends it to the Reducer. + +For every mapper, there will be one Combiner. Combiners are treated as local reducers. Hadoop does not provide any guarantee on combiner’s execution. + +Hadoop may not call combiner function if it is not required. Hadoop may call one or many times for a map output based on the requirement. + +Partitioner - +Partitioner allows distributing how outputs from the map stage are send to the reducers. Partitioner controls the keys partition of the intermediate map-outputs. The key or a subset of the key is used to derive the partition by a hash function. + +The total number of partitions is almost same as the number of reduce tasks for the job. Partitioner runs on the same machine where the mapper had completed its execution by consuming the mapper output. Entire mapper output sent to partitioner. + +Partitioner forms number of reduce task groups from the mapper output. By default, Hadoop framework is hash based partitioner. The Hash partitioner partitions the key space by using the hash code. + +Shuffling and Sorting - +The output of the partitioner is Shuffled to the reduce node. The shuffling is the physical movement of the data over the network. Once the mappers finished their process, the output produced are shuffled on reducer nodes. + +The mapper output is called as intermediate output and it is merged and then sorted. The sorted output is provided as a input to the reducer phase. + +Reducer - +After the map phase is over, all the intermediate values for the intermediate keys are combined into a list. Reducer task, which takes the output from a mapper as an input and combines those data tuples into a smaller set of tuples. There may be single reducer, multiple reducers. + +All the values associated with an intermediate key are guaranteed to go to the same reducer. The intermediate key and their value lists are passed to the reducer in sorted key order. The reducer outputs zero or more final key/value pairs and these are written to HDFS. + +RecordWriter & OutputFormat - +RecordWriter writes these output key-value pair from the Reducer phase to the output files. The way of writing the output key-value pairs to output files by RecordWriter is determined by the OutputFormat. + +OutputFormat instances provided by the Hadoop are used to write files in HDFS or on the local disk. 
The final output of the reducer is written to HDFS by OutputFormat instances. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..ac66005418b2b4a5b401b9a494edc4774ff401ce Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture2.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt new file mode 100644 index 0000000000000000000000000000000000000000..794ecd1aeec1f713493b7a669dd4e73d917e708e --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt @@ -0,0 +1,32 @@ +MapReduce Architecture +Last Updated : 10 Sep, 2020 +MapReduce and HDFS are the two major components of Hadoop that make it so powerful and efficient to use. MapReduce is a programming model used for efficient parallel processing over large data-sets in a distributed manner. The data is first split and then combined to produce the final result. Libraries for MapReduce have been written in many programming languages, with a variety of optimizations. The purpose of MapReduce in Hadoop is to map each job and then reduce it to equivalent tasks, providing less overhead over the cluster network and reducing the processing power needed. The MapReduce task is mainly divided into two phases: the Map phase and the Reduce phase. + +MapReduce Architecture: + +Components of MapReduce Architecture: +Client: The MapReduce client is the one who brings the job to MapReduce for processing. There can be multiple clients that continuously send jobs for processing to the Hadoop MapReduce Manager. +Job: The MapReduce job is the actual work that the client wants to do, comprised of many smaller tasks that the client wants to process or execute. +Hadoop MapReduce Master: It divides the particular job into subsequent job-parts. +Job-Parts: The tasks or sub-jobs obtained after dividing the main job. The results of all the job-parts are combined to produce the final output. +Input Data: The data set that is fed to MapReduce for processing. +Output Data: The final result obtained after the processing. +In MapReduce, we have a client. The client submits a job of a particular size to the Hadoop MapReduce Master. The MapReduce master then divides this job into further equivalent job-parts. These job-parts are then made available to the Map and Reduce tasks. The Map and Reduce tasks contain the program required by the use-case that the particular company is solving, and the developer writes the logic to fulfill that requirement. The input data is fed to the Map task, and the Map generates intermediate key-value pairs as its output. The output of Map, i.e. these key-value pairs, is then fed to the Reducer, and the final output is stored on HDFS. There can be any number of Map and Reduce tasks made available for processing the data, as per the requirement. The algorithms for Map and Reduce are written in an optimized way such that time and space complexity are minimal.
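+
+To make the flow just described concrete, the sketch below shows one possible implementation (not taken from this article) using the newer org.apache.hadoop.mapreduce API: a client-side driver together with a word-count style Map and Reduce pair. The class names and paths are illustrative assumptions only.
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+public class WordCountDriver {
+
+  // Map task: breaks each input line into words and emits <word, 1> as the intermediate key-value pair.
+  public static class WordMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
+    private static final IntWritable ONE = new IntWritable(1);
+    private final Text word = new Text();
+
+    @Override
+    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+      for (String token : value.toString().split("\\s+")) {
+        if (!token.isEmpty()) {
+          word.set(token);
+          context.write(word, ONE);
+        }
+      }
+    }
+  }
+
+  // Reduce task: receives every value for one key, aggregates them, and the result is written to HDFS.
+  public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
+    @Override
+    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
+      int sum = 0;
+      for (IntWritable v : values) {
+        sum += v.get();
+      }
+      context.write(key, new IntWritable(sum));
+    }
+  }
+
+  // Client side: configures the job, submits it to the cluster and waits for completion.
+  public static void main(String[] args) throws Exception {
+    Job job = Job.getInstance(new Configuration(), "word count");
+    job.setJarByClass(WordCountDriver.class);
+    job.setMapperClass(WordMapper.class);
+    job.setCombinerClass(SumReducer.class);
+    job.setReducerClass(SumReducer.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(IntWritable.class);
+    FileInputFormat.addInputPath(job, new Path(args[0]));
+    FileOutputFormat.setOutputPath(job, new Path(args[1]));
+    System.exit(job.waitForCompletion(true) ? 0 : 1);
+  }
+}
+
+Packaged into a jar, such a driver would be submitted with the standard hadoop jar command, with the HDFS input and output directories passed as its two arguments, and the final result would land in the HDFS output directory as described above.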
+ +Let’s discuss the MapReduce phases to get a better understanding of its architecture: + + + +The MapReduce task is mainly divided into 2 phases i.e. Map phase and Reduce phase. + +Map: As the name suggests its main use is to map the input data in key-value pairs. The input to the map may be a key-value pair where the key can be the id of some kind of address and value is the actual value that it keeps. The Map() function will be executed in its memory repository on each of these input key-value pairs and generates the intermediate key-value pair which works as input for the Reducer or Reduce() function. + +Reduce: The intermediate key-value pairs that work as input for Reducer are shuffled and sort and send to the Reduce() function. Reducer aggregate or group the data based on its key-value pair as per the reducer algorithm written by the developer. +How Job tracker and the task tracker deal with MapReduce: + +Job Tracker: The work of Job tracker is to manage all the resources and all the jobs across the cluster and also to schedule each map on the Task Tracker running on the same data node since there can be hundreds of data nodes available in the cluster. + +Task Tracker: The Task Tracker can be considered as the actual slaves that are working on the instruction given by the Job Tracker. This Task Tracker is deployed on each of the nodes available in the cluster that executes the Map and Reduce task as instructed by Job Tracker. +There is also one important component of MapReduce Architecture known as Job History Server. The Job History Server is a daemon process that saves and stores historical information about the task or application, like the logs which are generated during or after the job execution are stored on Job History Server. \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..c3dc202614b46ce1876f001c18840b28abe7485d Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Architecture3.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5c558ffa97667ffdd0579e7dc81ebfa0866d610 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt @@ -0,0 +1,853 @@ +Purpose +This document comprehensively describes all user-facing facets of the Hadoop MapReduce framework and serves as a tutorial. + +Prerequisites +Ensure that Hadoop is installed, configured and is running. More details: + +Single Node Setup for first-time users. +Cluster Setup for large, distributed clusters. +Overview +Hadoop MapReduce is a software framework for easily writing applications which process vast amounts of data (multi-terabyte data-sets) in-parallel on large clusters (thousands of nodes) of commodity hardware in a reliable, fault-tolerant manner. + +A MapReduce job usually splits the input data-set into independent chunks which are processed by the map tasks in a completely parallel manner. The framework sorts the outputs of the maps, which are then input to the reduce tasks. Typically both the input and the output of the job are stored in a file-system. 
The framework takes care of scheduling tasks, monitoring them and re-executes the failed tasks. + +Typically the compute nodes and the storage nodes are the same, that is, the MapReduce framework and the Hadoop Distributed File System (see HDFS Architecture Guide) are running on the same set of nodes. This configuration allows the framework to effectively schedule tasks on the nodes where data is already present, resulting in very high aggregate bandwidth across the cluster. + +The MapReduce framework consists of a single master JobTracker and one slave TaskTracker per cluster-node. The master is responsible for scheduling the jobs' component tasks on the slaves, monitoring them and re-executing the failed tasks. The slaves execute the tasks as directed by the master. + +Minimally, applications specify the input/output locations and supply map and reduce functions via implementations of appropriate interfaces and/or abstract-classes. These, and other job parameters, comprise the job configuration. The Hadoop job client then submits the job (jar/executable etc.) and configuration to the JobTracker which then assumes the responsibility of distributing the software/configuration to the slaves, scheduling tasks and monitoring them, providing status and diagnostic information to the job-client. + +Although the Hadoop framework is implemented in JavaTM, MapReduce applications need not be written in Java. + +Hadoop Streaming is a utility which allows users to create and run jobs with any executables (e.g. shell utilities) as the mapper and/or the reducer. +Hadoop Pipes is a SWIG- compatible C++ API to implement MapReduce applications (non JNITM based). +Inputs and Outputs +The MapReduce framework operates exclusively on pairs, that is, the framework views the input to the job as a set of pairs and produces a set of pairs as the output of the job, conceivably of different types. + +The key and value classes have to be serializable by the framework and hence need to implement the Writable interface. Additionally, the key classes have to implement the WritableComparable interface to facilitate sorting by the framework. + +Input and Output types of a MapReduce job: + +(input) -> map -> -> combine -> -> reduce -> (output) + +Example: WordCount v1.0 +Before we jump into the details, lets walk through an example MapReduce application to get a flavour for how they work. + +WordCount is a simple application that counts the number of occurences of each word in a given input set. + +This works with a local-standalone, pseudo-distributed or fully-distributed Hadoop installation (Single Node Setup). + +Source Code +WordCount.java +1. package org.myorg; +2. +3. import java.io.IOException; +4. import java.util.*; +5. +6. import org.apache.hadoop.fs.Path; +7. import org.apache.hadoop.conf.*; +8. import org.apache.hadoop.io.*; +9. import org.apache.hadoop.mapred.*; +10. import org.apache.hadoop.util.*; +11. +12. public class WordCount { +13. +14. public static class Map extends MapReduceBase implements Mapper { +15. private final static IntWritable one = new IntWritable(1); +16. private Text word = new Text(); +17. +18. public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { +19. String line = value.toString(); +20. StringTokenizer tokenizer = new StringTokenizer(line); +21. while (tokenizer.hasMoreTokens()) { +22. word.set(tokenizer.nextToken()); +23. output.collect(word, one); +24. } +25. } +26. } +27. +28. 
public static class Reduce extends MapReduceBase implements Reducer { +29. public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { +30. int sum = 0; +31. while (values.hasNext()) { +32. sum += values.next().get(); +33. } +34. output.collect(key, new IntWritable(sum)); +35. } +36. } +37. +38. public static void main(String[] args) throws Exception { +39. JobConf conf = new JobConf(WordCount.class); +40. conf.setJobName("wordcount"); +41. +42. conf.setOutputKeyClass(Text.class); +43. conf.setOutputValueClass(IntWritable.class); +44. +45. conf.setMapperClass(Map.class); +46. conf.setCombinerClass(Reduce.class); +47. conf.setReducerClass(Reduce.class); +48. +49. conf.setInputFormat(TextInputFormat.class); +50. conf.setOutputFormat(TextOutputFormat.class); +51. +52. FileInputFormat.setInputPaths(conf, new Path(args[0])); +53. FileOutputFormat.setOutputPath(conf, new Path(args[1])); +54. +55. JobClient.runJob(conf); +57. } +58. } +59. +Usage +Assuming HADOOP_HOME is the root of the installation and HADOOP_VERSION is the Hadoop version installed, compile WordCount.java and create a jar: + +$ mkdir wordcount_classes +$ javac -classpath ${HADOOP_HOME}/hadoop-${HADOOP_VERSION}-core.jar -d wordcount_classes WordCount.java +$ jar -cvf /usr/joe/wordcount.jar -C wordcount_classes/ . + +Assuming that: + +/usr/joe/wordcount/input - input directory in HDFS +/usr/joe/wordcount/output - output directory in HDFS +Sample text-files as input: + +$ bin/hadoop dfs -ls /usr/joe/wordcount/input/ +/usr/joe/wordcount/input/file01 +/usr/joe/wordcount/input/file02 + +$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file01 +Hello World Bye World + +$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file02 +Hello Hadoop Goodbye Hadoop + +Run the application: + +$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount /usr/joe/wordcount/input /usr/joe/wordcount/output + +Output: + +$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000 +Bye 1 +Goodbye 1 +Hadoop 2 +Hello 2 +World 2 +Applications can specify a comma separated list of paths which would be present in the current working directory of the task using the option -files. The -libjars option allows applications to add jars to the classpaths of the maps and reduces. The option -archives allows them to pass comma separated list of archives as arguments. These archives are unarchived and a link with name of the archive is created in the current working directory of tasks. More details about the command line options are available at Commands Guide. + +Running wordcount example with -libjars, -files and -archives: +hadoop jar hadoop-examples.jar wordcount -files cachefile.txt -libjars mylib.jar -archives myarchive.zip input output Here, myarchive.zip will be placed and unzipped into a directory by the name "myarchive.zip". + +Users can specify a different symbolic name for files and archives passed through -files and -archives option, using #. + +For example, hadoop jar hadoop-examples.jar wordcount -files dir1/dict.txt#dict1,dir2/dict.txt#dict2 -archives mytar.tgz#tgzdir input output Here, the files dir1/dict.txt and dir2/dict.txt can be accessed by tasks using the symbolic names dict1 and dict2 respectively. The archive mytar.tgz will be placed and unarchived into a directory by the name "tgzdir". + +Walk-through +The WordCount application is quite straight-forward. 
+ +The Mapper implementation (lines 14-26), via the map method (lines 18-25), processes one line at a time, as provided by the specified TextInputFormat (line 49). It then splits the line into tokens separated by whitespaces, via the StringTokenizer, and emits a key-value pair of < , 1>. + +For the given sample input the first map emits: +< Hello, 1> +< World, 1> +< Bye, 1> +< World, 1> +The second map emits: +< Hello, 1> +< Hadoop, 1> +< Goodbye, 1> +< Hadoop, 1> +We'll learn more about the number of maps spawned for a given job, and how to control them in a fine-grained manner, a bit later in the tutorial. + +WordCount also specifies a combiner (line 46). Hence, the output of each map is passed through the local combiner (which is same as the Reducer as per the job configuration) for local aggregation, after being sorted on the keys. + +The output of the first map: +< Bye, 1> +< Hello, 1> +< World, 2> +The output of the second map: +< Goodbye, 1> +< Hadoop, 2> +< Hello, 1> +The Reducer implementation (lines 28-36), via the reduce method (lines 29-35) just sums up the values, which are the occurence counts for each key (i.e. words in this example). + +Thus the output of the job is: +< Bye, 1> +< Goodbye, 1> +< Hadoop, 2> +< Hello, 2> +< World, 2> +The run method specifies various facets of the job, such as the input/output paths (passed via the command line), key/value types, input/output formats etc., in the JobConf. It then calls the JobClient.runJob (line 55) to submit the and monitor its progress. + +We'll learn more about JobConf, JobClient, Tool and other interfaces and classes a bit later in the tutorial. + +MapReduce - User Interfaces +This section provides a reasonable amount of detail on every user-facing aspect of the MapReduce framework. This should help users implement, configure and tune their jobs in a fine-grained manner. However, please note that the javadoc for each class/interface remains the most comprehensive documentation available; this is only meant to be a tutorial. + +Let us first take the Mapper and Reducer interfaces. Applications typically implement them to provide the map and reduce methods. + +We will then discuss other core interfaces including JobConf, JobClient, Partitioner, OutputCollector, Reporter, InputFormat, OutputFormat, OutputCommitter and others. + +Finally, we will wrap up by discussing some useful features of the framework such as the DistributedCache, IsolationRunner etc. + +Payload +Applications typically implement the Mapper and Reducer interfaces to provide the map and reduce methods. These form the core of the job. + +Mapper +Mapper maps input key/value pairs to a set of intermediate key/value pairs. + +Maps are the individual tasks that transform input records into intermediate records. The transformed intermediate records do not need to be of the same type as the input records. A given input pair may map to zero or many output pairs. + +The Hadoop MapReduce framework spawns one map task for each InputSplit generated by the InputFormat for the job. + +Overall, Mapper implementations are passed the JobConf for the job via the JobConfigurable.configure(JobConf) method and override it to initialize themselves. The framework then calls map(WritableComparable, Writable, OutputCollector, Reporter) for each key/value pair in the InputSplit for that task. Applications can then override the Closeable.close() method to perform any required cleanup. + +Output pairs do not need to be of the same types as input pairs. 
A given input pair may map to zero or many output pairs. Output pairs are collected with calls to OutputCollector.collect(WritableComparable,Writable). + +Applications can use the Reporter to report progress, set application-level status messages and update Counters, or just indicate that they are alive. + +All intermediate values associated with a given output key are subsequently grouped by the framework, and passed to the Reducer(s) to determine the final output. Users can control the grouping by specifying a Comparator via JobConf.setOutputKeyComparatorClass(Class). + +The Mapper outputs are sorted and then partitioned per Reducer. The total number of partitions is the same as the number of reduce tasks for the job. Users can control which keys (and hence records) go to which Reducer by implementing a custom Partitioner. + +Users can optionally specify a combiner, via JobConf.setCombinerClass(Class), to perform local aggregation of the intermediate outputs, which helps to cut down the amount of data transferred from the Mapper to the Reducer. + +The intermediate, sorted outputs are always stored in a simple (key-len, key, value-len, value) format. Applications can control if, and how, the intermediate outputs are to be compressed and the CompressionCodec to be used via the JobConf. + +How Many Maps? +The number of maps is usually driven by the total size of the inputs, that is, the total number of blocks of the input files. + +The right level of parallelism for maps seems to be around 10-100 maps per-node, although it has been set up to 300 maps for very cpu-light map tasks. Task setup takes awhile, so it is best if the maps take at least a minute to execute. + +Thus, if you expect 10TB of input data and have a blocksize of 128MB, you'll end up with 82,000 maps, unless setNumMapTasks(int) (which only provides a hint to the framework) is used to set it even higher. + +Reducer +Reducer reduces a set of intermediate values which share a key to a smaller set of values. + +The number of reduces for the job is set by the user via JobConf.setNumReduceTasks(int). + +Overall, Reducer implementations are passed the JobConf for the job via the JobConfigurable.configure(JobConf) method and can override it to initialize themselves. The framework then calls reduce(WritableComparable, Iterator, OutputCollector, Reporter) method for each pair in the grouped inputs. Applications can then override the Closeable.close() method to perform any required cleanup. + +Reducer has 3 primary phases: shuffle, sort and reduce. + +Shuffle +Input to the Reducer is the sorted output of the mappers. In this phase the framework fetches the relevant partition of the output of all the mappers, via HTTP. + +Sort +The framework groups Reducer inputs by keys (since different mappers may have output the same key) in this stage. + +The shuffle and sort phases occur simultaneously; while map-outputs are being fetched they are merged. + +Secondary Sort +If equivalence rules for grouping the intermediate keys are required to be different from those for grouping keys before reduction, then one may specify a Comparator via JobConf.setOutputValueGroupingComparator(Class). Since JobConf.setOutputKeyComparatorClass(Class) can be used to control how intermediate keys are grouped, these can be used in conjunction to simulate secondary sort on values. + +Reduce +In this phase the reduce(WritableComparable, Iterator, OutputCollector, Reporter) method is called for each pair in the grouped inputs. 
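+
+As a small illustration of that call (a hedged, made-up example rather than part of this tutorial), a reducer that keeps only the largest IntWritable per key could look like this in the old org.apache.hadoop.mapred API:
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+// For each key, scans the grouped values supplied by the framework and emits only the maximum.
+public class MaxValueReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
+
+  public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
+    int max = Integer.MIN_VALUE;
+    while (values.hasNext()) {
+      max = Math.max(max, values.next().get());
+    }
+    output.collect(key, new IntWritable(max));
+  }
+}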
+ +The output of the reduce task is typically written to the FileSystem via OutputCollector.collect(WritableComparable, Writable). + +Applications can use the Reporter to report progress, set application-level status messages and update Counters, or just indicate that they are alive. + +The output of the Reducer is not sorted. + +How Many Reduces? +The right number of reduces seems to be 0.95 or 1.75 multiplied by ( * mapred.tasktracker.reduce.tasks.maximum). + +With 0.95 all of the reduces can launch immediately and start transfering map outputs as the maps finish. With 1.75 the faster nodes will finish their first round of reduces and launch a second wave of reduces doing a much better job of load balancing. + +Increasing the number of reduces increases the framework overhead, but increases load balancing and lowers the cost of failures. + +The scaling factors above are slightly less than whole numbers to reserve a few reduce slots in the framework for speculative-tasks and failed tasks. + +Reducer NONE +It is legal to set the number of reduce-tasks to zero if no reduction is desired. + +In this case the outputs of the map-tasks go directly to the FileSystem, into the output path set by setOutputPath(Path). The framework does not sort the map-outputs before writing them out to the FileSystem. + +Partitioner +Partitioner partitions the key space. + +Partitioner controls the partitioning of the keys of the intermediate map-outputs. The key (or a subset of the key) is used to derive the partition, typically by a hash function. The total number of partitions is the same as the number of reduce tasks for the job. Hence this controls which of the m reduce tasks the intermediate key (and hence the record) is sent to for reduction. + +HashPartitioner is the default Partitioner. + +Reporter +Reporter is a facility for MapReduce applications to report progress, set application-level status messages and update Counters. + +Mapper and Reducer implementations can use the Reporter to report progress or just indicate that they are alive. In scenarios where the application takes a significant amount of time to process individual key/value pairs, this is crucial since the framework might assume that the task has timed-out and kill that task. Another way to avoid this is to set the configuration parameter mapred.task.timeout to a high-enough value (or even set it to zero for no time-outs). + +Applications can also update Counters using the Reporter. + +OutputCollector +OutputCollector is a generalization of the facility provided by the MapReduce framework to collect data output by the Mapper or the Reducer (either the intermediate outputs or the output of the job). + +Hadoop MapReduce comes bundled with a library of generally useful mappers, reducers, and partitioners. + +Job Configuration +JobConf represents a MapReduce job configuration. + +JobConf is the primary interface for a user to describe a MapReduce job to the Hadoop framework for execution. The framework tries to faithfully execute the job as described by JobConf, however: + +f Some configuration parameters may have been marked as final by administrators and hence cannot be altered. +While some job parameters are straight-forward to set (e.g. setNumReduceTasks(int)), other parameters interact subtly with the rest of the framework and/or job configuration and are more complex to set (e.g. setNumMapTasks(int)). 
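+
+A brief, hedged illustration of that contrast, with a purely illustrative wrapper class:
+
+import org.apache.hadoop.mapred.JobConf;
+
+public class ConfSketch {
+  public static JobConf tune(JobConf conf) {
+    // Straightforward: the framework will run exactly this many reduce tasks.
+    conf.setNumReduceTasks(4);
+    // Subtle: this is only a hint; the actual number of maps is still driven by the input splits.
+    conf.setNumMapTasks(200);
+    return conf;
+  }
+}
+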
+JobConf is typically used to specify the Mapper, combiner (if any), Partitioner, Reducer, InputFormat, OutputFormat and OutputCommitter implementations. JobConf also indicates the set of input files (setInputPaths(JobConf, Path...) /addInputPath(JobConf, Path)) and (setInputPaths(JobConf, String) /addInputPaths(JobConf, String)) and where the output files should be written (setOutputPath(Path)). + +Optionally, JobConf is used to specify other advanced facets of the job such as the Comparator to be used, files to be put in the DistributedCache, whether intermediate and/or job outputs are to be compressed (and how), debugging via user-provided scripts (setMapDebugScript(String)/setReduceDebugScript(String)) , whether job tasks can be executed in a speculative manner (setMapSpeculativeExecution(boolean))/(setReduceSpeculativeExecution(boolean)) , maximum number of attempts per task (setMaxMapAttempts(int)/setMaxReduceAttempts(int)) , percentage of tasks failure which can be tolerated by the job (setMaxMapTaskFailuresPercent(int)/setMaxReduceTaskFailuresPercent(int)) etc. + +Of course, users can use set(String, String)/get(String, String) to set/get arbitrary parameters needed by applications. However, use the DistributedCache for large amounts of (read-only) data. + +Task Execution & Environment +The TaskTracker executes the Mapper/ Reducer task as a child process in a separate jvm. + +The child-task inherits the environment of the parent TaskTracker. The user can specify additional options to the child-jvm via the mapred.{map|reduce}.child.java.opts configuration parameter in the JobConf such as non-standard paths for the run-time linker to search shared libraries via -Djava.library.path=<> etc. If the mapred.{map|reduce}.child.java.opts parameters contains the symbol @taskid@ it is interpolated with value of taskid of the MapReduce task. + +Here is an example with multiple arguments and substitutions, showing jvm GC logging, and start of a passwordless JVM JMX agent so that it can connect with jconsole and the likes to watch child memory, threads and get thread dumps. It also sets the maximum heap-size of the map and reduce child jvm to 512MB & 1024MB respectively. It also adds an additional path to the java.library.path of the child-jvm. + + + mapred.map.child.java.opts + + -Xmx512M -Djava.library.path=/home/mycompany/lib -verbose:gc -Xloggc:/tmp/@taskid@.gc + -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false + + + + + mapred.reduce.child.java.opts + + -Xmx1024M -Djava.library.path=/home/mycompany/lib -verbose:gc -Xloggc:/tmp/@taskid@.gc + -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false + + + +Memory Management +Users/admins can also specify the maximum virtual memory of the launched child-task, and any sub-process it launches recursively, using mapred.{map|reduce}.child.ulimit. Note that the value set here is a per process limit. The value for mapred.{map|reduce}.child.ulimit should be specified in kilo bytes (KB). And also the value must be greater than or equal to the -Xmx passed to JavaVM, else the VM might not start. + +Note: mapred.{map|reduce}.child.java.opts are used only for configuring the launched child tasks from task tracker. Configuring the memory options for daemons is documented in Configuring the Environment of the Hadoop Daemons. + +The memory available to some parts of the framework is also configurable. 
In map and reduce tasks, performance may be influenced by adjusting parameters influencing the concurrency of operations and the frequency with which data will hit disk. Monitoring the filesystem counters for a job- particularly relative to byte counts from the map and into the reduce- is invaluable to the tuning of these parameters. + +Users can choose to override default limits of Virtual Memory and RAM enforced by the task tracker, if memory management is enabled. Users can set the following parameter per job: + +Name Type Description +mapred.task.maxvmem int A number, in bytes, that represents the maximum Virtual Memory task-limit for each task of the job. A task will be killed if it consumes more Virtual Memory than this number. +mapred.task.maxpmem int A number, in bytes, that represents the maximum RAM task-limit for each task of the job. This number can be optionally used by Schedulers to prevent over-scheduling of tasks on a node based on RAM needs. +Map Parameters +A record emitted from a map will be serialized into a buffer and metadata will be stored into accounting buffers. As described in the following options, when either the serialization buffer or the metadata exceed a threshold, the contents of the buffers will be sorted and written to disk in the background while the map continues to output records. If either buffer fills completely while the spill is in progress, the map thread will block. When the map is finished, any remaining records are written to disk and all on-disk segments are merged into a single file. Minimizing the number of spills to disk can decrease map time, but a larger buffer also decreases the memory available to the mapper. + +Name Type Description +io.sort.mb int The cumulative size of the serialization and accounting buffers storing records emitted from the map, in megabytes. +io.sort.record.percent float The ratio of serialization to accounting space can be adjusted. Each serialized record requires 16 bytes of accounting information in addition to its serialized size to effect the sort. This percentage of space allocated from io.sort.mb affects the probability of a spill to disk being caused by either exhaustion of the serialization buffer or the accounting space. Clearly, for a map outputting small records, a higher value than the default will likely decrease the number of spills to disk. +io.sort.spill.percent float This is the threshold for the accounting and serialization buffers. When this percentage of either buffer has filled, their contents will be spilled to disk in the background. Let io.sort.record.percent be r, io.sort.mb be x, and this value be q. The maximum number of records collected before the collection thread will spill is r * x * q * 2^16. Note that a higher value may decrease the number of- or even eliminate- merges, but will also increase the probability of the map task getting blocked. The lowest average map times are usually obtained by accurately estimating the size of the map output and preventing multiple spills. +Other notes + +If either spill threshold is exceeded while a spill is in progress, collection will continue until the spill is finished. For example, if io.sort.buffer.spill.percent is set to 0.33, and the remainder of the buffer is filled while the spill runs, the next spill will include all the collected records, or 0.66 of the buffer, and will not generate additional spills. In other words, the thresholds are defining triggers, not blocking. 
+A record larger than the serialization buffer will first trigger a spill, then be spilled to a separate file. It is undefined whether or not this record will first pass through the combiner. +Shuffle/Reduce Parameters +As described previously, each reduce fetches the output assigned to it by the Partitioner via HTTP into memory and periodically merges these outputs to disk. If intermediate compression of map outputs is turned on, each output is decompressed into memory. The following options affect the frequency of these merges to disk prior to the reduce and the memory allocated to map output during the reduce. + +Name Type Description +io.sort.factor int Specifies the number of segments on disk to be merged at the same time. It limits the number of open files and compression codecs during the merge. If the number of files exceeds this limit, the merge will proceed in several passes. Though this limit also applies to the map, most jobs should be configured so that hitting this limit is unlikely there. +mapred.inmem.merge.threshold int The number of sorted map outputs fetched into memory before being merged to disk. Like the spill thresholds in the preceding note, this is not defining a unit of partition, but a trigger. In practice, this is usually set very high (1000) or disabled (0), since merging in-memory segments is often less expensive than merging from disk (see notes following this table). This threshold influences only the frequency of in-memory merges during the shuffle. +mapred.job.shuffle.merge.percent float The memory threshold for fetched map outputs before an in-memory merge is started, expressed as a percentage of memory allocated to storing map outputs in memory. Since map outputs that can't fit in memory can be stalled, setting this high may decrease parallelism between the fetch and merge. Conversely, values as high as 1.0 have been effective for reduces whose input can fit entirely in memory. This parameter influences only the frequency of in-memory merges during the shuffle. +mapred.job.shuffle.input.buffer.percent float The percentage of memory- relative to the maximum heapsize as typically specified in mapred.reduce.child.java.opts- that can be allocated to storing map outputs during the shuffle. Though some memory should be set aside for the framework, in general it is advantageous to set this high enough to store large and numerous map outputs. +mapred.job.reduce.input.buffer.percent float The percentage of memory relative to the maximum heapsize in which map outputs may be retained during the reduce. When the reduce begins, map outputs will be merged to disk until those that remain are under the resource limit this defines. By default, all map outputs are merged to disk before the reduce begins to maximize the memory available to the reduce. For less memory-intensive reduces, this should be increased to avoid trips to disk. +Other notes + +If a map output is larger than 25 percent of the memory allocated to copying map outputs, it will be written directly to disk without first staging through memory. +When running with a combiner, the reasoning about high merge thresholds and large buffers may not hold. For merges started before all map outputs have been fetched, the combiner is run while spilling to disk. In some cases, one can obtain better reduce times by spending resources combining map outputs- making disk spills small and parallelizing spilling and fetching- rather than aggressively increasing buffer sizes. 
+When merging in-memory map outputs to disk to begin the reduce, if an intermediate merge is necessary because there are segments to spill and at least io.sort.factor segments already on disk, the in-memory map outputs will be part of the intermediate merge. +Directory Structure +The task tracker has local directory, ${mapred.local.dir}/taskTracker/ to create localized cache and localized job. It can define multiple local directories (spanning multiple disks) and then each filename is assigned to a semi-random local directory. When the job starts, task tracker creates a localized job directory relative to the local directory specified in the configuration. Thus the task tracker directory structure looks as following: + +${mapred.local.dir}/taskTracker/distcache/ : The public distributed cache for the jobs of all users. This directory holds the localized public distributed cache. Thus localized public distributed cache is shared among all the tasks and jobs of all users. +${mapred.local.dir}/taskTracker/$user/distcache/ : The private distributed cache for the jobs of the specific user. This directory holds the localized private distributed cache. Thus localized private distributed cache is shared among all the tasks and jobs of the specific user only. It is not accessible to jobs of other users. +${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/ : The localized job directory +${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/work/ : The job-specific shared directory. The tasks can use this space as scratch space and share files among them. This directory is exposed to the users through the configuration property job.local.dir. The directory can accessed through the API JobConf.getJobLocalDir(). It is available as System property also. So, users (streaming etc.) can call System.getProperty("job.local.dir") to access the directory. +${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/jars/ : The jars directory, which has the job jar file and expanded jar. The job.jar is the application's jar file that is automatically distributed to each machine. It is expanded in jars directory before the tasks for the job start. The job.jar location is accessible to the application through the api JobConf.getJar() . To access the unjarred directory, JobConf.getJar().getParent() can be called. +${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/job.xml : The job.xml file, the generic job configuration, localized for the job. +${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid : The task directory for each task attempt. Each task directory again has the following structure : +${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid/job.xml : A job.xml file, task localized job configuration, Task localization means that properties have been set that are specific to this particular task within the job. The properties localized for each task are described below. +${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid/output : A directory for intermediate output files. This contains the temporary map reduce data generated by the framework such as map output files etc. +${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid/work : The current working directory of the task. With jvm reuse enabled for tasks, this directory will be the directory on which the jvm has started +${mapred.local.dir}/taskTracker/$user/jobcache/$jobid/$taskid/work/tmp : The temporary directory for the task. 
(User can specify the property mapred.child.tmp to set the value of temporary directory for map and reduce tasks. This defaults to ./tmp. If the value is not an absolute path, it is prepended with task's working directory. Otherwise, it is directly assigned. The directory will be created if it doesn't exist. Then, the child java tasks are executed with option -Djava.io.tmpdir='the absolute path of the tmp dir'. Pipes and streaming are set with environment variable, TMPDIR='the absolute path of the tmp dir'). This directory is created, if mapred.child.tmp has the value ./tmp +Task JVM Reuse +Jobs can enable task JVMs to be reused by specifying the job configuration mapred.job.reuse.jvm.num.tasks. If the value is 1 (the default), then JVMs are not reused (i.e. 1 task per JVM). If it is -1, there is no limit to the number of tasks a JVM can run (of the same job). One can also specify some value greater than 1 using the api JobConf.setNumTasksToExecutePerJvm(int) + +Configured Parameters +The following properties are localized in the job configuration for each task's execution: + +Name Type Description +mapred.job.id String The job id +mapred.jar String job.jar location in job directory +job.local.dir String The job specific shared scratch space +mapred.tip.id String The task id +mapred.task.id String The task attempt id +mapred.task.is.map boolean Is this a map task +mapred.task.partition int The id of the task within the job +map.input.file String The filename that the map is reading from +map.input.start long The offset of the start of the map input split +map.input.length long The number of bytes in the map input split +mapred.work.output.dir String The task's temporary output directory +Note: During the execution of a streaming job, the names of the "mapred" parameters are transformed. The dots ( . ) become underscores ( _ ). For example, mapred.job.id becomes mapred_job_id and mapred.jar becomes mapred_jar. To get the values in a streaming job's mapper/reducer use the parameter names with the underscores. + +Task Logs +The standard output (stdout) and error (stderr) streams of the task are read by the TaskTracker and logged to ${HADOOP_LOG_DIR}/userlogs + +Distributing Libraries +The DistributedCache can also be used to distribute both jars and native libraries for use in the map and/or reduce tasks. The child-jvm always has its current working directory added to the java.library.path and LD_LIBRARY_PATH. And hence the cached libraries can be loaded via System.loadLibrary or System.load. More details on how to load shared libraries through distributed cache are documented at native_libraries.html + +Job Submission and Monitoring +JobClient is the primary interface by which user-job interacts with the JobTracker. + +JobClient provides facilities to submit jobs, track their progress, access component-tasks' reports and logs, get the MapReduce cluster's status information and so on. + +The job submission process involves: + +Checking the input and output specifications of the job. +Computing the InputSplit values for the job. +Setting up the requisite accounting information for the DistributedCache of the job, if necessary. +Copying the job's jar and configuration to the MapReduce system directory on the FileSystem. +Submitting the job to the JobTracker and optionally monitoring it's status. +Job history files are also logged to user specified directory hadoop.job.history.user.location which defaults to job output directory. 
The files are stored in "_logs/history/" in the specified directory. Hence, by default they will be in mapred.output.dir/_logs/history. User can stop logging by giving the value none for hadoop.job.history.user.location + +User can view the history logs summary in specified directory using the following command +$ bin/hadoop job -history output-dir +This command will print job details, failed and killed tip details. +More details about the job such as successful tasks and task attempts made for each task can be viewed using the following command +$ bin/hadoop job -history all output-dir +User can use OutputLogFilter to filter log files from the output directory listing. + +Normally the user creates the application, describes various facets of the job via JobConf, and then uses the JobClient to submit the job and monitor its progress. + +Job Authorization +Job level authorization and queue level authorization are enabled on the cluster, if the configuration mapred.acls.enabled is set to true. When enabled, access control checks are done by (a) the JobTracker before allowing users to submit jobs to queues and administering these jobs and (b) by the JobTracker and the TaskTracker before allowing users to view job details or to modify a job using MapReduce APIs, CLI or web user interfaces. + +A job submitter can specify access control lists for viewing or modifying a job via the configuration properties mapreduce.job.acl-view-job and mapreduce.job.acl-modify-job respectively. By default, nobody is given access in these properties. + +However, irrespective of the job ACLs configured, a job's owner, the superuser and cluster administrators (mapreduce.cluster.administrators) and queue administrators of the queue to which the job was submitted to (mapred.queue.queue-name.acl-administer-jobs) always have access to view and modify a job. + +A job view ACL authorizes users against the configured mapreduce.job.acl-view-job before returning possibly sensitive information about a job, like: + +job level counters +task level counters +tasks's diagnostic information +task logs displayed on the TaskTracker web UI +job.xml showed by the JobTracker's web UI +Other information about a job, like its status and its profile, is accessible to all users, without requiring authorization. + +A job modification ACL authorizes users against the configured mapreduce.job.acl-modify-job before allowing modifications to jobs, like: + +killing a job +killing/failing a task of a job +setting the priority of a job +These operations are also permitted by the queue level ACL, "mapred.queue.queue-name.acl-administer-jobs", configured via mapred-queue-acls.xml. The caller will be able to do the operation if he/she is part of either queue admins ACL or job modification ACL. + +The format of a job level ACL is the same as the format for a queue level ACL as defined in the Cluster Setup documentation. + +Job Control +Users may need to chain MapReduce jobs to accomplish complex tasks which cannot be done via a single MapReduce job. This is fairly easy since the output of the job typically goes to distributed file-system, and the output, in turn, can be used as the input for the next job. + +However, this also means that the onus on ensuring jobs are complete (success/failure) lies squarely on the clients. In such cases, the various job-control options are: + +runJob(JobConf) : Submits the job and returns only after the job has completed. 
+submitJob(JobConf) : Only submits the job, then poll the returned handle to the RunningJob to query status and make scheduling decisions. +JobConf.setJobEndNotificationURI(String) : Sets up a notification upon job-completion, thus avoiding polling. +Job Credentials +In a secure cluster, the user is authenticated via Kerberos' kinit command. Because of scalability concerns, we don't push the client's Kerberos' tickets in MapReduce jobs. Instead, we acquire delegation tokens from each HDFS NameNode that the job will use and store them in the job as part of job submission. The delegation tokens are automatically obtained for the HDFS that holds the staging directories, where the job job files are written, and any HDFS systems referenced by FileInputFormats, FileOutputFormats, DistCp, and the distributed cache. Other applications require to set the configuration "mapreduce.job.hdfs-servers" for all NameNodes that tasks might need to talk during the job execution. This is a comma separated list of file system names, such as "hdfs://nn1/,hdfs://nn2/". These tokens are passed to the JobTracker as part of the job submission as Credentials. + +Similar to HDFS delegation tokens, we also have MapReduce delegation tokens. The MapReduce tokens are provided so that tasks can spawn jobs if they wish to. The tasks authenticate to the JobTracker via the MapReduce delegation tokens. The delegation token can be obtained via the API in JobClient.getDelegationToken. The obtained token must then be pushed onto the credentials that is there in the JobConf used for job submission. The API Credentials.addToken can be used for this. + +The credentials are sent to the JobTracker as part of the job submission process. The JobTracker persists the tokens and secrets in its filesystem (typically HDFS) in a file within mapred.system.dir/JOBID. The TaskTracker localizes the file as part job localization. Tasks see an environment variable called HADOOP_TOKEN_FILE_LOCATION and the framework sets this to point to the localized file. In order to launch jobs from tasks or for doing any HDFS operation, tasks must set the configuration "mapreduce.job.credentials.binary" to point to this token file. + +The HDFS delegation tokens passed to the JobTracker during job submission are are cancelled by the JobTracker when the job completes. This is the default behavior unless mapreduce.job.complete.cancel.delegation.tokens is set to false in the JobConf. For jobs whose tasks in turn spawns jobs, this should be set to false. Applications sharing JobConf objects between multiple jobs on the JobClient side should look at setting mapreduce.job.complete.cancel.delegation.tokens to false. This is because the Credentials object within the JobConf will then be shared. All jobs will end up sharing the same tokens, and hence the tokens should not be canceled when the jobs in the sequence finish. + +Apart from the HDFS delegation tokens, arbitrary secrets can also be passed during the job submission for tasks to access other third party services. The APIs JobConf.getCredentials or JobContext.getCredentials() should be used to get the credentials object and then Credentials.addSecretKey should be used to add secrets. + +For applications written using the old MapReduce API, the Mapper/Reducer classes need to implement JobConfigurable in order to get access to the credentials in the tasks. A reference to the JobConf passed in the JobConfigurable.configure should be stored. 
In the new MapReduce API, a similar thing can be done in the Mapper.setup method. The api JobConf.getCredentials() or the api JobContext.getCredentials() should be used to get the credentials reference (depending on whether the new MapReduce API or the old MapReduce API is used). Tasks can access the secrets using the APIs in Credentials + +Job Input +InputFormat describes the input-specification for a MapReduce job. + +The MapReduce framework relies on the InputFormat of the job to: + +Validate the input-specification of the job. +Split-up the input file(s) into logical InputSplit instances, each of which is then assigned to an individual Mapper. +Provide the RecordReader implementation used to glean input records from the logical InputSplit for processing by the Mapper. +The default behavior of file-based InputFormat implementations, typically sub-classes of FileInputFormat, is to split the input into logical InputSplit instances based on the total size, in bytes, of the input files. However, the FileSystem blocksize of the input files is treated as an upper bound for input splits. A lower bound on the split size can be set via mapred.min.split.size. + +Clearly, logical splits based on input-size is insufficient for many applications since record boundaries must be respected. In such cases, the application should implement a RecordReader, who is responsible for respecting record-boundaries and presents a record-oriented view of the logical InputSplit to the individual task. + +TextInputFormat is the default InputFormat. + +If TextInputFormat is the InputFormat for a given job, the framework detects input-files with the .gz extensions and automatically decompresses them using the appropriate CompressionCodec. However, it must be noted that compressed files with the above extensions cannot be split and each compressed file is processed in its entirety by a single mapper. + +InputSplit +InputSplit represents the data to be processed by an individual Mapper. + +Typically InputSplit presents a byte-oriented view of the input, and it is the responsibility of RecordReader to process and present a record-oriented view. + +FileSplit is the default InputSplit. It sets map.input.file to the path of the input file for the logical split. + +RecordReader +RecordReader reads pairs from an InputSplit. + +Typically the RecordReader converts the byte-oriented view of the input, provided by the InputSplit, and presents a record-oriented to the Mapper implementations for processing. RecordReader thus assumes the responsibility of processing record boundaries and presents the tasks with keys and values. + +Job Output +OutputFormat describes the output-specification for a MapReduce job. + +The MapReduce framework relies on the OutputFormat of the job to: + +Validate the output-specification of the job; for example, check that the output directory doesn't already exist. +Provide the RecordWriter implementation used to write the output files of the job. Output files are stored in a FileSystem. +TextOutputFormat is the default OutputFormat. + +OutputCommitter +OutputCommitter describes the commit of task output for a MapReduce job. + +The MapReduce framework relies on the OutputCommitter of the job to: + +Setup the job during initialization. For example, create the temporary output directory for the job during the initialization of the job. Job setup is done by a separate task when the job is in PREP state and after initializing tasks. Once the setup task completes, the job will be moved to RUNNING state. 
+Cleanup the job after the job completion. For example, remove the temporary output directory after the job completion. Job cleanup is done by a separate task at the end of the job. Job is declared SUCCEDED/FAILED/KILLED after the cleanup task completes. +Setup the task temporary output. Task setup is done as part of the same task, during task initialization. +Check whether a task needs a commit. This is to avoid the commit procedure if a task does not need commit. +Commit of the task output. Once task is done, the task will commit it's output if required. +Discard the task commit. If the task has been failed/killed, the output will be cleaned-up. If task could not cleanup (in exception block), a separate task will be launched with same attempt-id to do the cleanup. +FileOutputCommitter is the default OutputCommitter. Job setup/cleanup tasks occupy map or reduce slots, whichever is free on the TaskTracker. And JobCleanup task, TaskCleanup tasks and JobSetup task have the highest priority, and in that order. + +Task Side-Effect Files +In some applications, component tasks need to create and/or write to side-files, which differ from the actual job-output files. + +In such cases there could be issues with two instances of the same Mapper or Reducer running simultaneously (for example, speculative tasks) trying to open and/or write to the same file (path) on the FileSystem. Hence the application-writer will have to pick unique names per task-attempt (using the attemptid, say attempt_200709221812_0001_m_000000_0), not just per task. + +To avoid these issues the MapReduce framework, when the OutputCommitter is FileOutputCommitter, maintains a special ${mapred.output.dir}/_temporary/_${taskid} sub-directory accessible via ${mapred.work.output.dir} for each task-attempt on the FileSystem where the output of the task-attempt is stored. On successful completion of the task-attempt, the files in the ${mapred.output.dir}/_temporary/_${taskid} (only) are promoted to ${mapred.output.dir}. Of course, the framework discards the sub-directory of unsuccessful task-attempts. This process is completely transparent to the application. + +The application-writer can take advantage of this feature by creating any side-files required in ${mapred.work.output.dir} during execution of a task via FileOutputFormat.getWorkOutputPath(), and the framework will promote them similarly for succesful task-attempts, thus eliminating the need to pick unique paths per task-attempt. + +Note: The value of ${mapred.work.output.dir} during execution of a particular task-attempt is actually ${mapred.output.dir}/_temporary/_{$taskid}, and this value is set by the MapReduce framework. So, just create any side-files in the path returned by FileOutputFormat.getWorkOutputPath() from MapReduce task to take advantage of this feature. + +The entire discussion holds true for maps of jobs with reducer=NONE (i.e. 0 reduces) since output of the map, in that case, goes directly to HDFS. + +RecordWriter +RecordWriter writes the output pairs to an output file. + +RecordWriter implementations write the job outputs to the FileSystem. + +Other Useful Features +Submitting Jobs to Queues +Users submit jobs to Queues. Queues, as collection of jobs, allow the system to provide specific functionality. For example, queues use ACLs to control which users who can submit jobs to them. Queues are expected to be primarily used by Hadoop Schedulers. + +Hadoop comes configured with a single mandatory queue, called 'default'. 
Queue names are defined in the mapred.queue.names property of the Hadoop site configuration. Some job schedulers, such as the Capacity Scheduler, support multiple queues. + +A job defines the queue it needs to be submitted to through the mapred.job.queue.name property, or through the setQueueName(String) API. Setting the queue name is optional. If a job is submitted without an associated queue name, it is submitted to the 'default' queue. + +Counters +Counters represent global counters, defined either by the MapReduce framework or applications. Each Counter can be of any Enum type. Counters of a particular Enum are bunched into groups of type Counters.Group. + +Applications can define arbitrary Counters (of type Enum) and update them via Reporter.incrCounter(Enum, long) or Reporter.incrCounter(String, String, long) in the map and/or reduce methods. These counters are then globally aggregated by the framework. + +DistributedCache +DistributedCache distributes application-specific, large, read-only files efficiently. + +DistributedCache is a facility provided by the MapReduce framework to cache files (text, archives, jars and so on) needed by applications. + +Applications specify the files to be cached via urls (hdfs://) in the JobConf. The DistributedCache assumes that the files specified via hdfs:// urls are already present on the FileSystem. + +The framework will copy the necessary files to the slave node before any tasks for the job are executed on that node. Its efficiency stems from the fact that the files are only copied once per job and the ability to cache archives which are un-archived on the slaves. + +DistributedCache tracks the modification timestamps of the cached files. Clearly the cache files should not be modified by the application or externally while the job is executing. + +DistributedCache can be used to distribute simple, read-only data/text files and more complex types such as archives and jars. Archives (zip, tar, tgz and tar.gz files) are un-archived at the slave nodes. Files have execution permissions set. + +The files/archives can be distributed by setting the property mapred.cache.{files|archives}. If more than one file/archive has to be distributed, they can be added as comma separated paths. The properties can also be set by APIs DistributedCache.addCacheFile(URI,conf)/ DistributedCache.addCacheArchive(URI,conf) and DistributedCache.setCacheFiles(URIs,conf)/ DistributedCache.setCacheArchives(URIs,conf) where URI is of the form hdfs://host:port/absolute-path#link-name. In Streaming, the files can be distributed through command line option -cacheFile/-cacheArchive. + +Optionally users can also direct the DistributedCache to symlink the cached file(s) into the current working directory of the task via the DistributedCache.createSymlink(Configuration) api. Or by setting the configuration property mapred.create.symlink as yes. The DistributedCache will use the fragment of the URI as the name of the symlink. For example, the URI hdfs://namenode:port/lib.so.1#lib.so will have the symlink name as lib.so in task's cwd for the file lib.so.1 in distributed cache. + +The DistributedCache can also be used as a rudimentary software distribution mechanism for use in the map and/or reduce tasks. It can be used to distribute both jars and native libraries. The DistributedCache.addArchiveToClassPath(Path, Configuration) or DistributedCache.addFileToClassPath(Path, Configuration) api can be used to cache files/jars and also add them to the classpath of child-jvm. 
The same can be done by setting the configuration properties mapred.job.classpath.{files|archives}. Similarly the cached files that are symlinked into the working directory of the task can be used to distribute native libraries and load them. + +Private and Public DistributedCache Files +DistributedCache files can be private or public, that determines how they can be shared on the slave nodes. + +"Private" DistributedCache files are cached in a local directory private to the user whose jobs need these files. These files are shared by all tasks and jobs of the specific user only and cannot be accessed by jobs of other users on the slaves. A DistributedCache file becomes private by virtue of its permissions on the file system where the files are uploaded, typically HDFS. If the file has no world readable access, or if the directory path leading to the file has no world executable access for lookup, then the file becomes private. +"Public" DistributedCache files are cached in a global directory and the file access is setup such that they are publicly visible to all users. These files can be shared by tasks and jobs of all users on the slaves. A DistributedCache file becomes public by virtue of its permissions on the file system where the files are uploaded, typically HDFS. If the file has world readable access, AND if the directory path leading to the file has world executable access for lookup, then the file becomes public. In other words, if the user intends to make a file publicly available to all users, the file permissions must be set to be world readable, and the directory permissions on the path leading to the file must be world executable. +Tool +The Tool interface supports the handling of generic Hadoop command-line options. + +Tool is the standard for any MapReduce tool or application. The application should delegate the handling of standard command-line options to GenericOptionsParser via ToolRunner.run(Tool, String[]) and only handle its custom arguments. + +The generic Hadoop command-line options are: +-conf +-D +-fs +-jt + +IsolationRunner +IsolationRunner is a utility to help debug MapReduce programs. + +To use the IsolationRunner, first set keep.failed.task.files to true (also see keep.task.files.pattern). + +Next, go to the node on which the failed task ran and go to the TaskTracker's local directory and run the IsolationRunner: +$ cd /taskTracker/${taskid}/work +$ bin/hadoop org.apache.hadoop.mapred.IsolationRunner ../job.xml + +IsolationRunner will run the failed task in a single jvm, which can be in the debugger, over precisely the same input. + +Note that currently IsolationRunner will only re-run map tasks. + +Profiling +Profiling is a utility to get a representative (2 or 3) sample of built-in java profiler for a sample of maps and reduces. + +User can specify whether the system should collect profiler information for some of the tasks in the job by setting the configuration property mapred.task.profile. The value can be set using the api JobConf.setProfileEnabled(boolean). If the value is set true, the task profiling is enabled. The profiler information is stored in the user log directory. By default, profiling is not enabled for the job. + +Once user configures that profiling is needed, she/he can use the configuration property mapred.task.profile.{maps|reduces} to set the ranges of MapReduce tasks to profile. The value can be set using the api JobConf.setProfileTaskRange(boolean,String). By default, the specified range is 0-2. 
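+
+For illustration, a minimal sketch (not part of the original tutorial) of driving these profiling settings through the org.apache.hadoop.mapred.JobConf methods named above; the driver class name is hypothetical:
+
+import org.apache.hadoop.mapred.JobConf;
+
+public class ProfilingConfigSketch {
+  public static void main(String[] args) {
+    JobConf conf = new JobConf(ProfilingConfigSketch.class);
+    // Ask the framework to collect profiler information for some tasks.
+    conf.setProfileEnabled(true);
+    // Profile only the first three map task attempts (the default range "0-2").
+    conf.setProfileTaskRange(true, "0-2");
+    // ... set mapper/reducer classes and submit the job as usual ...
+  }
+}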
+ +User can also specify the profiler configuration arguments by setting the configuration property mapred.task.profile.params. The value can be specified using the api JobConf.setProfileParams(String). If the string contains a %s, it will be replaced with the name of the profiling output file when the task runs. These parameters are passed to the task child JVM on the command line. The default value for the profiling parameters is -agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s + +Debugging +The MapReduce framework provides a facility to run user-provided scripts for debugging. When a MapReduce task fails, a user can run a debug script, to process task logs for example. The script is given access to the task's stdout and stderr outputs, syslog and jobconf. The output from the debug script's stdout and stderr is displayed on the console diagnostics and also as part of the job UI. + +In the following sections we discuss how to submit a debug script with a job. The script file needs to be distributed and submitted to the framework. + +How to distribute the script file: +The user needs to use DistributedCache to distribute and symlink the script file. + +How to submit the script: +A quick way to submit the debug script is to set values for the properties mapred.map.task.debug.script and mapred.reduce.task.debug.script, for debugging map and reduce tasks respectively. These properties can also be set by using APIs JobConf.setMapDebugScript(String) and JobConf.setReduceDebugScript(String) . In streaming mode, a debug script can be submitted with the command-line options -mapdebug and -reducedebug, for debugging map and reduce tasks respectively. + +The arguments to the script are the task's stdout, stderr, syslog and jobconf files. The debug command, run on the node where the MapReduce task failed, is: +$script $stdout $stderr $syslog $jobconf + +Pipes programs have the c++ program name as a fifth argument for the command. Thus for the pipes programs the command is +$script $stdout $stderr $syslog $jobconf $program + +Default Behavior: +For pipes, a default script is run to process core dumps under gdb, prints stack trace and gives info about running threads. + +JobControl +JobControl is a utility which encapsulates a set of MapReduce jobs and their dependencies. + +Data Compression +Hadoop MapReduce provides facilities for the application-writer to specify compression for both intermediate map-outputs and the job-outputs i.e. output of the reduces. It also comes bundled with CompressionCodec implementation for the zlib compression algorithm. The gzip file format is also supported. + +Hadoop also provides native implementations of the above compression codecs for reasons of both performance (zlib) and non-availability of Java libraries. More details on their usage and availability are available here. + +Intermediate Outputs +Applications can control compression of intermediate map-outputs via the JobConf.setCompressMapOutput(boolean) api and the CompressionCodec to be used via the JobConf.setMapOutputCompressorClass(Class) api. + +Job Outputs +Applications can control compression of job-outputs via the FileOutputFormat.setCompressOutput(JobConf, boolean) api and the CompressionCodec to be used can be specified via the FileOutputFormat.setOutputCompressorClass(JobConf, Class) api. + +If the job outputs are to be stored in the SequenceFileOutputFormat, the required SequenceFile.CompressionType (i.e. 
RECORD / BLOCK - defaults to RECORD) can be specified via the SequenceFileOutputFormat.setOutputCompressionType(JobConf, SequenceFile.CompressionType) api. + +Skipping Bad Records +Hadoop provides an option where a certain set of bad input records can be skipped when processing map inputs. Applications can control this feature through the SkipBadRecords class. + +This feature can be used when map tasks crash deterministically on certain input. This usually happens due to bugs in the map function. Usually, the user would have to fix these bugs. This is, however, not possible sometimes. The bug may be in third party libraries, for example, for which the source code is not available. In such cases, the task never completes successfully even after multiple attempts, and the job fails. With this feature, only a small portion of data surrounding the bad records is lost, which may be acceptable for some applications (those performing statistical analysis on very large data, for example). + +By default this feature is disabled. For enabling it, refer to SkipBadRecords.setMapperMaxSkipRecords(Configuration, long) and SkipBadRecords.setReducerMaxSkipGroups(Configuration, long). + +With this feature enabled, the framework gets into 'skipping mode' after a certain number of map failures. For more details, see SkipBadRecords.setAttemptsToStartSkipping(Configuration, int). In 'skipping mode', map tasks maintain the range of records being processed. To do this, the framework relies on the processed record counter. See SkipBadRecords.COUNTER_MAP_PROCESSED_RECORDS and SkipBadRecords.COUNTER_REDUCE_PROCESSED_GROUPS. This counter enables the framework to know how many records have been processed successfully, and hence, what record range caused a task to crash. On further attempts, this range of records is skipped. + +The number of records skipped depends on how frequently the processed record counter is incremented by the application. It is recommended that this counter be incremented after every record is processed. This may not be possible in some applications that typically batch their processing. In such cases, the framework may skip additional records surrounding the bad record. Users can control the number of skipped records through SkipBadRecords.setMapperMaxSkipRecords(Configuration, long) and SkipBadRecords.setReducerMaxSkipGroups(Configuration, long). The framework tries to narrow the range of skipped records using a binary search-like approach. The skipped range is divided into two halves and only one half gets executed. On subsequent failures, the framework figures out which half contains bad records. A task will be re-executed till the acceptable skipped value is met or all task attempts are exhausted. To increase the number of task attempts, use JobConf.setMaxMapAttempts(int) and JobConf.setMaxReduceAttempts(int). + +Skipped records are written to HDFS in the sequence file format, for later analysis. The location can be changed through SkipBadRecords.setSkipOutputPath(JobConf, Path). + +Example: WordCount v2.0 +Here is a more complete WordCount which uses many of the features provided by the MapReduce framework we discussed so far. + +This needs the HDFS to be up and running, especially for the DistributedCache-related features. Hence it only works with a pseudo-distributed or fully-distributed Hadoop installation. + +Source Code +WordCount.java +1. package org.myorg; +2. +3. import java.io.*; +4. import java.util.*; +5. +6. import org.apache.hadoop.fs.Path; +7. 
import org.apache.hadoop.filecache.DistributedCache; +8. import org.apache.hadoop.conf.*; +9. import org.apache.hadoop.io.*; +10. import org.apache.hadoop.mapred.*; +11. import org.apache.hadoop.util.*; +12. +13. public class WordCount extends Configured implements Tool { +14. +15. public static class Map extends MapReduceBase implements Mapper { +16. +17. static enum Counters { INPUT_WORDS } +18. +19. private final static IntWritable one = new IntWritable(1); +20. private Text word = new Text(); +21. +22. private boolean caseSensitive = true; +23. private Set patternsToSkip = new HashSet(); +24. +25. private long numRecords = 0; +26. private String inputFile; +27. +28. public void configure(JobConf job) { +29. caseSensitive = job.getBoolean("wordcount.case.sensitive", true); +30. inputFile = job.get("map.input.file"); +31. +32. if (job.getBoolean("wordcount.skip.patterns", false)) { +33. Path[] patternsFiles = new Path[0]; +34. try { +35. patternsFiles = DistributedCache.getLocalCacheFiles(job); +36. } catch (IOException ioe) { +37. System.err.println("Caught exception while getting cached files: " + StringUtils.stringifyException(ioe)); +38. } +39. for (Path patternsFile : patternsFiles) { +40. parseSkipFile(patternsFile); +41. } +42. } +43. } +44. +45. private void parseSkipFile(Path patternsFile) { +46. try { +47. BufferedReader fis = new BufferedReader(new FileReader(patternsFile.toString())); +48. String pattern = null; +49. while ((pattern = fis.readLine()) != null) { +50. patternsToSkip.add(pattern); +51. } +52. } catch (IOException ioe) { +53. System.err.println("Caught exception while parsing the cached file '" + patternsFile + "' : " + StringUtils.stringifyException(ioe)); +54. } +55. } +56. +57. public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException { +58. String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase(); +59. +60. for (String pattern : patternsToSkip) { +61. line = line.replaceAll(pattern, ""); +62. } +63. +64. StringTokenizer tokenizer = new StringTokenizer(line); +65. while (tokenizer.hasMoreTokens()) { +66. word.set(tokenizer.nextToken()); +67. output.collect(word, one); +68. reporter.incrCounter(Counters.INPUT_WORDS, 1); +69. } +70. +71. if ((++numRecords % 100) == 0) { +72. reporter.setStatus("Finished processing " + numRecords + " records " + "from the input file: " + inputFile); +73. } +74. } +75. } +76. +77. public static class Reduce extends MapReduceBase implements Reducer { +78. public void reduce(Text key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { +79. int sum = 0; +80. while (values.hasNext()) { +81. sum += values.next().get(); +82. } +83. output.collect(key, new IntWritable(sum)); +84. } +85. } +86. +87. public int run(String[] args) throws Exception { +88. JobConf conf = new JobConf(getConf(), WordCount.class); +89. conf.setJobName("wordcount"); +90. +91. conf.setOutputKeyClass(Text.class); +92. conf.setOutputValueClass(IntWritable.class); +93. +94. conf.setMapperClass(Map.class); +95. conf.setCombinerClass(Reduce.class); +96. conf.setReducerClass(Reduce.class); +97. +98. conf.setInputFormat(TextInputFormat.class); +99. conf.setOutputFormat(TextOutputFormat.class); +100. +101. List other_args = new ArrayList(); +102. for (int i=0; i < args.length; ++i) { +103. if ("-skip".equals(args[i])) { +104. DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf); +105. conf.setBoolean("wordcount.skip.patterns", true); +106. } else { +107. 
other_args.add(args[i]); +108. } +109. } +110. +111. FileInputFormat.setInputPaths(conf, new Path(other_args.get(0))); +112. FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); +113. +114. JobClient.runJob(conf); +115. return 0; +116. } +117. +118. public static void main(String[] args) throws Exception { +119. int res = ToolRunner.run(new Configuration(), new WordCount(), args); +120. System.exit(res); +121. } +122. } +123. +Sample Runs +Sample text-files as input: + +$ bin/hadoop dfs -ls /usr/joe/wordcount/input/ +/usr/joe/wordcount/input/file01 +/usr/joe/wordcount/input/file02 + +$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file01 +Hello World, Bye World! + +$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file02 +Hello Hadoop, Goodbye to hadoop. + +Run the application: + +$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount /usr/joe/wordcount/input /usr/joe/wordcount/output + +Output: + +$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000 +Bye 1 +Goodbye 1 +Hadoop, 1 +Hello 2 +World! 1 +World, 1 +hadoop. 1 +to 1 +Notice that the inputs differ from the first version we looked at, and how they affect the outputs. + +Now, lets plug-in a pattern-file which lists the word-patterns to be ignored, via the DistributedCache. + +$ hadoop dfs -cat /user/joe/wordcount/patterns.txt +\. +\, +\! +to +Run it again, this time with more options: + +$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount -Dwordcount.case.sensitive=true /usr/joe/wordcount/input /usr/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt + +As expected, the output: + +$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000 +Bye 1 +Goodbye 1 +Hadoop 1 +Hello 2 +World 2 +hadoop 1 +Run it once more, this time switch-off case-sensitivity: + +$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount -Dwordcount.case.sensitive=false /usr/joe/wordcount/input /usr/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt + +Sure enough, the output: + +$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000 +bye 1 +goodbye 1 +hadoop 2 +hello 2 +world 2 +Highlights +The second version of WordCount improves upon the previous one by using some features offered by the MapReduce framework: + +Demonstrates how applications can access configuration parameters in the configure method of the Mapper (and Reducer) implementations (lines 28-43). +Demonstrates how the DistributedCache can be used to distribute read-only data needed by the jobs. Here it allows the user to specify word-patterns to skip while counting (line 104). +Demonstrates the utility of the Tool interface and the GenericOptionsParser to handle generic Hadoop command-line options (lines 87-116, 119). +Demonstrates how applications can use Counters (line 68) and how they can set application-specific status information via the Reporter instance passed to the map (and reduce) method (line 72). +Java and JNI are trademarks or registered trademarks of Sun Microsystems, Inc. in the United States and other countries. 
\ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..9bf78621edcfd8628be103b9c09e30720e6bbebd Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Tutorial.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt new file mode 100644 index 0000000000000000000000000000000000000000..8781313caf4876561408b0c880c13f81b4cba1cf --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt @@ -0,0 +1,48 @@ +MapReduce in NCache allows developers to process huge amounts of unstructured data in parallel across an NCache cluster. To distribute input data and analyze it in parallel, MapReduce operates in parallel on all nodes in a cluster of any size. + +MapReduce is a programming model for processing and generating large data sets with a parallel, distributed algorithm on a cluster. The term “MapReduce” refers to two distinct phases. The first phase is ‘Map’ phase, which takes a set of data and converts it into another set of data, where individual items are broken down into key-value pairs. The second phase is ‘Reduce’ phase, which takes output from ‘Map’ as an input and reduces that data set into a smaller and more meaningful data set. + +A user defined Mapper processes a key-value pair to generate a set of intermediate key-value pairs. Reducer processes all those intermediate key-value pairs (having same intermediate key) to aggregate, perform calculations or any other operation on the pairs. Another optional component, Combiner, performs merging of the intermediate key-value pairs generated by Mapper before these key-value pairs can be sent over to the Reducer. + +The following example illustrates a MapReduce task (with and without combiner) being executed over a cluster of three nodes. The task takes orders as an input to the Mapper and extracts count of products consumed in it. In figure 1, Mapper’s output is directly sent to the reducer and is being aggregated on Reducer’s node whereas in figure 2, count over a single node is aggregated first and this aggregated count is sent to the Reducer node for final aggregation. + +MapReduce without Combiner: + +MapReduce in Ncache without Combiner + +MapReduce with Combiner: + +MapReduce in Ncache with Combiner + +How does MapReduce Work? +Generally, MapReduce consists of two (sometimes three) phases: i.e. Mapping, Combining (optional) and Reducing. + +Mapping phase: Filters and prepares the input for the next phase that may be Combining or Reducing. +Reduction phase: Takes care of the aggregation and compilation of the final result. +Combining phase: Responsible for reduction local to the node, before sending the input to the Reducers. Combine phase optimizes performance as it minimizes the network traffic between Mapper and Reducers by sending the output to the Reducer in chunks. +Similarly, NCache MapReduce has three phases: Map, Combine, and Reduce. Only the Mapper is necessary to implement, Reducer and Combiner implementations are optional. NCache MapReduce will execute its default reducer if the user does not implement Reducer. Default reducer merges output omitted by Mapper into an array. 
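+
+For illustration only, the division of work among the three phases can be sketched in plain Java. This is a framework-agnostic sketch of the concept, not the NCache API; all names and data below are hypothetical:
+
+import java.util.*;
+import java.util.stream.*;
+
+public class PhaseSketch {
+  public static void main(String[] args) {
+    // Orders seen on one node; the Mapper emits a (product, 1) pair per order.
+    List<String> orders = List.of("apple", "banana", "apple");
+    Map<String, List<Integer>> mapped = new HashMap<>();
+    for (String product : orders) {
+      mapped.computeIfAbsent(product, k -> new ArrayList<>()).add(1);
+    }
+    // Combine: aggregate the pairs locally before they leave the node.
+    Map<String, Integer> combined = new HashMap<>();
+    mapped.forEach((product, ones) -> combined.put(product, ones.size()));
+    // Reduce: merge the partial counts arriving from all nodes (one map per node).
+    Map<String, Integer> reduced = Stream.of(combined)
+        .flatMap(m -> m.entrySet().stream())
+        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, Integer::sum));
+    System.out.println(reduced); // e.g. {apple=2, banana=1}
+  }
+}
+
+In a real deployment the combined output of each node would be shipped to the Reducer over the network rather than merged inside a single JVM.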
+ +The Mapper, Combiner and Reducer are executed simultaneously during an NCache MapReduce task on the NCache cluster. Mapper output is individually sent to the Combiner. When Combiner’s output reaches the specified chunk size, it is then sent to the Reducer, which finalizes and persists the output. + +In order to monitor the submitted task, a traceable object is provided to the user. + +Number of tasks to be executed simultaneously and Mapper’s output chunk is configurable. Mapper’s output is sent to combiner or reducer once output chunk reaches the configured chunk size. See NCache Administrator’s Guide. + +A typical MapReduce task has the following components: + +Mapper: Processes the initial input and enables user to emit the output into a dictionary to be used as an input for the combiner or reducer. + +Combiner Factory: creates and manages combiners for each key emitted into output by the mapper. + +Combiner: Works as local reducer to the node where Mapper’s output is combined to minimize traffic between Mapper and Reducer. + +Reducer Factory: create and manages reducers for each key emitted into output by the mapper or combiner. + +Reducer: Processes all those intermediate key-value pairs generated by Mapper or combined by Combiner to aggregate, perform calculations or apply different operations to produce the reduced output. + +Key Filter: Key Filter, as the name indicates, allows the user to filter cache data based on its keys before sent to the Mapper. The KeyFilter is called during Mapper phase. If it returns true, the Map will be executed on the key. If it returns false, Mapper will skip the key and move to next one from the Cache. + +TrackerTask: This component lets you keep track of the progress of the task and its status as the task is executed. And lets you fetch the output of the task and enumerate it. + +Output: The output is stored in-memory, on the server side. It can be enumerated using the TrackableTask instance on the client application. + diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..f6609e6302a873309ff95de23a68c65af5124b9f Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce Working and Components.txt.xml.xls differ diff --git "a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt" "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt" new file mode 100644 index 0000000000000000000000000000000000000000..084bf15f448b467539bda8b7ed81c110215c9f30 --- /dev/null +++ "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt" @@ -0,0 +1,24 @@ +MapReduce – Components: +Split + +Split can be called as a logical representation of block. In map-reduce 1 mapper can process 1 split at a time. + +We have seen in HDFS that the default size can be 64mb or 128mb, then if file size is 1280mb, block size is 128mb than we will have 10 splits, then 10 mappers will run for the input file. + +Job + +We have seen in the HDFS that we have a client, master and slaves. Client configures the job and submits it to the master. We can say job as a program in which we execute mapper and reducer. + +Task + +We can say Task as a sub-division of job. Here job is divided into smaller tasks. 
Master divides the work or job into multiple tasks and gives them to slaves. The actual work is done by the slaves. + +Here we can also say that Client need to submit the job to Resource Manger which is running on Master, then Master converts that job and submits tasks to the slaves. These all tasks are run parallel and independent on each other. + +Resource Manager + +It is a daemon which runs on Master node. + +Node Manager + +It is a daemon which runs on Slaves. \ No newline at end of file diff --git "a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt.xml.xls" "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt.xml.xls" new file mode 100644 index 0000000000000000000000000000000000000000..3cc9106b1e3ba326be2ace1739b783522c4ab8f1 Binary files /dev/null and "b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce \342\200\223 Components.txt.xml.xls" differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce.txt new file mode 100644 index 0000000000000000000000000000000000000000..0722022c07ae5a5942421179c6fbc7fbdd33e1a6 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce.txt @@ -0,0 +1,39 @@ +MapReduce +Stephen J. Bigelow +By +Stephen J. Bigelow, Senior Technology Editor +MapReduce is a core component of the Apache Hadoop software framework. + +Hadoop enables resilient, distributed processing of massive unstructured data sets across commodity computer clusters, in which each node of the cluster includes its own storage. MapReduce serves two essential functions: it filters and parcels out work to various nodes within the cluster or map, a function sometimes referred to as the mapper, and it organizes and reduces the results from each node into a cohesive answer to a query, referred to as the reducer. + +How MapReduce works +The original version of MapReduce involved several component daemons, including: + +JobTracker -- the master node that manages all the jobs and resources in a cluster; +TaskTrackers -- agents deployed to each machine in the cluster to run the map and reduce tasks; and +JobHistory Server -- a component that tracks completed jobs and is typically deployed as a separate function or with JobTracker. +With the introduction of MapReduce and Hadoop version 2, previous JobTracker and TaskTracker daemons have been replaced with components of Yet Another Resource Negotiator (YARN), called ResourceManager and NodeManager. + +ResourceManager runs on a master node and handles the submission and scheduling of jobs on the cluster. It also monitors jobs and allocates resources. +NodeManager runs on slave nodes and interoperates with Resource Manager to run tasks and track resource usage. NodeManager can employ other daemons to assist with task execution on the slave node. +To distribute input data and collate results, MapReduce operates in parallel across massive cluster sizes. Because cluster size doesn't affect a processing job's final results, jobs can be split across almost any number of servers. Therefore, MapReduce and the overall Hadoop framework simplify software development. + +MapReduce is available in several languages, including C, C++, Java, Ruby, Perl and Python. Programmers can use MapReduce libraries to create tasks without dealing with communication or coordination between nodes. + +MapReduce is also fault-tolerant, with each node periodically reporting its status to a master node. 
If a node doesn't respond as expected, the master node reassigns that piece of the job to other available nodes in the cluster. This creates resiliency and makes it practical for MapReduce to run on inexpensive commodity servers. + +MapReduce examples and uses +The power of MapReduce is in its ability to tackle huge data sets by distributing processing across many nodes, and then combining or reducing the results of those nodes. + +As a basic example, users could list and count the number of times every word appears in a novel as a single server application, but that is time-consuming. By contrast, users can split the task among 26 people, so each takes a page, writes a word on a separate sheet of paper and takes a new page when they're finished. This is the map aspect of MapReduce. And if a person leaves, another person takes his or her place. This exemplifies MapReduce's fault-tolerant element. + +When all the pages are processed, users sort their single-word pages into 26 boxes, which represent the first letter of each word. Each user takes a box and sorts each word in the stack alphabetically. The number of pages with the same word is an example of the reduce aspect of MapReduce. + +There is a broad range of real-world uses for MapReduce involving complex and seemingly unrelated data sets. For example, a social networking site could use MapReduce to determine users' potential friends, colleagues and other contacts based on site activity, names, locations, employers and many other data elements. A booking website could use MapReduce to examine the search criteria and historical behaviors of users, and can create customized offerings for each. An industrial facility could collect equipment data from different sensors across the installation and use MapReduce to tailor maintenance schedules or predict equipment failures to improve overall uptime and cost-savings. + +MapReduce services and alternatives +One challenge with MapReduce is the infrastructure it requires to run. Many businesses that could benefit from big data tasks can't sustain the capital and overhead needed for such an infrastructure. As a result, some organizations rely on public cloud services for Hadoop and MapReduce, which offer enormous scalability with minimal capital costs or maintenance overhead. + +For example, Amazon Web Services (AWS) provides Hadoop as a service through its Amazon Elastic MapReduce (EMR) offering. Microsoft Azure offers its HDInsight service, which enables users to provision Hadoop, Apache Spark and other clusters for data processing tasks. Google Cloud Platform provides its Cloud Dataproc service to run Spark and Hadoop clusters. + +For organizations that prefer to build and maintain private, on-premises big data infrastructures, Hadoop and MapReduce represent only one option. Organizations can opt to deploy other platforms, such as Apache Spark, High-Performance Computing Cluster and Hydra. The big data framework an enterprise chooses will depend on the types of processing tasks required, supported programming languages, and performance and infrastructure demands. 
\ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..be8f637081ad7e9390a5aa95e0570957a2922a26 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/MapReduce.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt new file mode 100644 index 0000000000000000000000000000000000000000..43986d403ef2ae475090aea4195357c012ddb377 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt @@ -0,0 +1,178 @@
+Understanding MapReduce in Hadoop
+December 6, 2020
+MapReduce is a component of the Apache Hadoop ecosystem, a framework that enhances massive data processing. Other components of Apache Hadoop include Hadoop Distributed File System (HDFS), Yarn, and Apache Pig.
+
+The MapReduce component enhances the processing of massive data using dispersed and parallel algorithms in the Hadoop ecosystem. This programming model is applied in social platforms and e-commerce to analyze huge data collected from online users.
+
+This article provides an understanding of MapReduce in Hadoop. It will enable readers to gain insights on how vast volumes of data are simplified and how MapReduce is used in real-life applications.
+
+Introduction to MapReduce in Hadoop
+MapReduce is a Hadoop framework used for writing applications that can process vast amounts of data on large clusters. It can also be called a programming model in which we can process large datasets across computer clusters. This application allows data to be stored in a distributed form. It simplifies enormous volumes of data and large scale computing.
+
+There are two primary tasks in MapReduce: map and reduce. We perform the former task before the latter. In the map job, we split the input dataset into chunks. The map tasks process these chunks in parallel. The map outputs are then used as inputs for the reduce tasks. Reducers process the intermediate data from the maps into smaller tuples, which reduces the tasks and leads to the final output of the framework.
+
+The MapReduce framework enhances the scheduling and monitoring of tasks. The failed tasks are re-executed by the framework.
This framework can be used easily, even by programmers with little expertise in distributed processing. MapReduce can be implemented using various programming languages such as Java, Hive, Pig, Scala, and Python. + +How MapReduce in Hadoop works +An overview of MapReduce Architecture and MapReduce’s phases will help us understand how MapReduce in Hadoop works. + +MapReduce architecture +The following diagram shows a MapReduce architecture. + +MapReduce Architecture + +Image Source: A4Academics + +MapReduce architecture consists of various components. A brief description of these components can improve our understanding on how MapReduce works. + +Job: This is the actual work that needs to be executed or processed +Task: This is a piece of the actual work that needs to be executed or processed. A MapReduce job comprises many small tasks that need to be executed. +Job Tracker: This tracker plays the role of scheduling jobs and tracking all jobs assigned to the task tracker. +Task Tracker: This tracker plays the role of tracking tasks and reporting the status of tasks to the job tracker. +Input data: This is the data used to process in the mapping phase. +Output data: This is the result of mapping and reducing. +Client: This is a program or Application Programming Interface (API) that submits jobs to the MapReduce. MapReduce can accept jobs from many clients. +Hadoop MapReduce Master: This plays the role of dividing jobs into job-parts. +Job-parts: These are sub-jobs that result from the division of the main job. +In the MapReduce architecture, clients submit jobs to the MapReduce Master. This master will then sub-divide the job into equal sub-parts. The job-parts will be used for the two main tasks in MapReduce: mapping and reducing. + +The developer will write logic that satisfies the requirements of the organization or company. The input data will be split and mapped. + +The intermediate data will then be sorted and merged. The reducer that will generate a final output stored in the HDFS will process the resulting output. + +The following diagram shows a simplified flow diagram for the MapReduce program. + +MapReduce Flow Diagram + +Image Source: Data Flair + +How job trackers and task trackers work +Every job consists of two key components: mapping task and reducing task. The map task plays the role of splitting jobs into job-parts and mapping intermediate data. The reduce task plays the role of shuffling and reducing intermediate data into smaller units. + +The job tracker acts as a master. It ensures that we execute all jobs. The job tracker schedules jobs that have been submitted by clients. It will assign jobs to task trackers. Each task tracker consists of a map task and reduces the task. Task trackers report the status of each assigned job to the job tracker. The following diagram summarizes how job trackers and task trackers work. + +Job Trackers and Task Trackers + +Image Source: CNBlogs + +Phases of MapReduce +The MapReduce program is executed in three main phases: mapping, shuffling, and reducing. There is also an optional phase known as the combiner phase. + +Mapping Phase +This is the first phase of the program. There are two steps in this phase: splitting and mapping. A dataset is split into equal units called chunks (input splits) in the splitting step. Hadoop consists of a RecordReader that uses TextInputFormat to transform input splits into key-value pairs. + +The key-value pairs are then used as inputs in the mapping step. 
This is the only data format that a mapper can read or understand. The mapping step contains a coding logic that is applied to these data blocks. In this step, the mapper processes the key-value pairs and produces an output of the same form (key-value pairs). + +Shuffling phase +This is the second phase that takes place after the completion of the Mapping phase. It consists of two main steps: sorting and merging. In the sorting step, the key-value pairs are sorted using the keys. Merging ensures that key-value pairs are combined. + +The shuffling phase facilitates the removal of duplicate values and the grouping of values. Different values with similar keys are grouped. The output of this phase will be keys and values, just like in the Mapping phase. + +Reducer phase +In the reducer phase, the output of the shuffling phase is used as the input. The reducer processes this input further to reduce the intermediate values into smaller values. It provides a summary of the entire dataset. The output from this phase is stored in the HDFS. + +The following diagram shows an example of a MapReduce with the three main phases. Splitting is often included in the mapping stage. + +Example of MapReduce + +Image Source: Edureka + +Combiner phase +This is an optional phase that’s used for optimizing the MapReduce process. It’s used for reducing the pap outputs at the node level. In this phase, duplicate outputs from the map outputs can be combined into a single output. The combiner phase increases speed in the Shuffling phase by improving the performance of Jobs. + +The following diagram shows how all the four phases of MapReduce have been applied. + +MapReduce with Combiner Phase + +Image Source: Cloud Front + +Benefits of Hadoop MapReduce +Speed: MapReduce can process huge unstructured data in a short time. +Fault-tolerance: The MapReduce framework can handle failures. +Cost-effective: Hadoop has a scale-out feature that enables users to process or store data in a cost-effective manner. +Scalability: Hadoop provides a highly scalable framework. MapReduce allows users to run applications from many nodes. +Data availability: Replicas of data are sent to various nodes within the network. This ensures copies of the data are available in the event of failure. +Parallel Processing: In MapReduce, multiple job-parts of the same dataset can be processed in a parallel manner. This reduces the time taken to complete a task. +Applications of Hadoop MapReduce +The following are some of the practical applications of the MapReduce program. + +E-commerce +E-commerce companies such as Walmart, E-Bay, and Amazon use MapReduce to analyze buying behavior. MapReduce provides meaningful information that is used as the basis for developing product recommendations. Some of the information used include site records, e-commerce catalogs, purchase history, and interaction logs. + +Social networks +The MapReduce programming tool can evaluate certain information on social media platforms such as Facebook, Twitter, and LinkedIn. It can evaluate important information such as who liked your status and who viewed your profile. + +Entertainment +Netflix uses MapReduce to analyze the clicks and logs of online customers. This information helps the company suggest movies based on customers’ interests and behavior. + +Conclusion +MapReduce is a crucial processing component of the Hadoop framework. It’s a quick, scalable, and cost-effective program that can help data analysts and developers process huge data. 
+
+This programming model is a suitable tool for analyzing usage patterns on websites and e-commerce platforms. Companies providing online services can utilize this framework to improve their marketing strategies.
\ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..a5b6491de17c059764cf9ae23f2108a1ecd86f2c Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/Understanding MapReduce in Hadoop.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt new file mode 100644 index 0000000000000000000000000000000000000000..62946b28397f9d778aefc24118fb724b5c6c1c1d --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt @@ -0,0 +1,73 @@
+INTRODUCTION
+MapReduce Architecture is a programming model and a software framework used for processing enormous amounts of data. A MapReduce program works in two stages, namely Map and Reduce. Map tasks deal with the splitting and mapping of data, while Reduce tasks shuffle and reduce the data.
+
+Hadoop MapReduce Architecture is capable of running MapReduce programs written in different languages: C, Python, Ruby, and Java. MapReduce programs in cloud computing are parallel in nature, and therefore help perform large-scale data analysis using multiple machines in the cluster.
+
+MapReduce Architecture
+Components of MapReduce Architecture
+1. MAPREDUCE ARCHITECTURE
+HDFS and MapReduce architecture are the two significant parts of Hadoop that make it so efficient and powerful to use. MapReduce is a programming model used for efficient parallel processing over huge data collections in a distributed manner. The data is first split and afterwards combined to produce the final result.
+
+The MapReduce task is mainly divided into 2 phases:
+
+Map Phase
+Reduce Phase
+The libraries for MapReduce are written in many programming languages, with various optimizations. The purpose of MapReduce in Hadoop is to map each of the jobs and then reduce them to equivalent tasks, giving less overhead over the cluster network and reducing the required processing power.
+
+2. COMPONENTS OF MAPREDUCE ARCHITECTURE
+Components of MapReduce Architecture are:
+Client
+Job
+Hadoop MapReduce Master
+Job Parts
+Input Data
+Output Data
+Client: The MapReduce client is the one who brings the Job to MapReduce for processing. There can be numerous clients available that continuously send jobs for processing to the Hadoop MapReduce Manager.
+Job: The MapReduce Job is the actual work that the client wants to do, which is composed of many smaller tasks that the client wants to execute or process.
+Hadoop MapReduce Master: It divides the job into subsequent job parts.
+Job Parts: The sub-jobs or tasks that are obtained after dividing the main job. The results of all the job parts are combined to produce the final output.
+Input Data: The data set that is fed to MapReduce for processing.
+Output Data: The final result obtained after processing.
+In the MapReduce architecture, we have a client. The client submits a job of a specific size to the Hadoop MapReduce Master. The MapReduce Master then divides this job into further equivalent job parts. These job parts are then made available to the MapReduce tasks.
+
+The MapReduce program is written according to the needs of the use case that the particular organization is solving. The developer writes the logic to satisfy the requirement that the business has. The input is then fed to the Map task, and the Map produces intermediate key-value pairs as its output. The output of the Map, i.e. these key-value pairs, is then fed to the Reducer, and the final output is stored on HDFS, the Hadoop Distributed File System.
+
+There can be any number of MapReduce tasks made available for processing the data, as required. The MapReduce algorithm is designed in a highly optimized way so that its time and space complexity are minimal.
+
+Let us examine the MapReduce phases to gain a better understanding of its architecture:
+
+MapReduce Architecture is fundamentally divided into two phases, namely the Map phase and the Reduce phase.
+
+Map: As the name suggests, its main use is to map the input data into key-value pairs. The input to the map may be a key-value pair where the key can be the id of some sort of address, and the value is the actual content it holds. The Map() function is executed in memory over each of these input key-value pairs and generates intermediate key-value pairs, which serve as input for the Reducer or the Reduce() function.
+Reduce: The intermediate key-value pairs that serve as input for the Reducer are shuffled, sorted, and sent to the Reduce() function. The Reducer aggregates or groups the data based on its key-value pairs according to the reducer algorithm written by the developer.
+How the Task Tracker and the Job Tracker manage the MapReduce architecture:
+Task Tracker: It can be considered as the actual slave that works on the instructions given by the Job Tracker. A Task Tracker is deployed on every node available in the cluster and executes the MapReduce tasks as instructed by the Job Tracker.
+Job Tracker: Its role is to manage all the jobs and all the resources across the cluster and to schedule each map on a Task Tracker running on the same data node, since there can be many data nodes available in the cluster.
+There is also one more significant component of the MapReduce architecture known as the Job History Server. The Job History Server is a daemon process that saves and stores historical information about the application or task, such as the logs generated during or after the job execution.
+
+Hadoop MapReduce architecture has now become a popular solution for today's requirements. The design of Hadoop keeps various goals in mind. A Hadoop MapReduce architecture diagram helps you to understand it better.
+
+Hadoop MapReduce framework architecture includes three significant layers. They are:
+HDFS - Hadoop Distributed File System: NameNode and DataNode, Block in HDFS, and Replication Management.
+Yarn: Scheduler, and Application Manager.
+MapReduce: Map Task, and Reduce Task.
+CONCLUSION
+The MapReduce architecture simplifies the complex process of handling the massive data that is available in the Hadoop framework. There have been numerous significant changes in the MapReduce programming model over time.
+
+Hadoop is one of the most well-known frameworks for handling big data, and one of its best supporting building blocks is MapReduce. If you are looking for a career as a data analyst in the data science field, then you should know about this rising and popular programming model.
\ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..8c8c99abc15e4929bc8851db6d5d69970f488603 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What Is MapReduce Architecture An Important Overview For 2021.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt new file mode 100644 index 0000000000000000000000000000000000000000..92be416b47c9d2759c4a919055a030be99e4a497 --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt @@ -0,0 +1,6 @@
+What are the components of MapReduce
+JobTracker and TaskTracker are the main components of MapReduce.
+Job Tracker
+JobTracker is the master that creates and runs the job. The JobTracker, which runs on the name node, allocates jobs to the TaskTrackers.
+TaskTracker
+TaskTracker is a slave that runs on a data node. The TaskTracker runs the tasks and reports the status of each task to the JobTracker.
\ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..098b139dbcddcf69f1a0a9272e7a122d3f3f9eb1 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/What are the components of MapReduce.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt b/src/main/resources/cdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt new file mode 100644 index 0000000000000000000000000000000000000000..52ab29a9ab6190757983ca0211695d08fec6b49e --- /dev/null +++ b/src/main/resources/cdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt @@ -0,0 +1,1407 @@ +MapReduce & Hadoop II +Mingshen Sun +The Chinese University of Hong Kong +mssun@cse.cuhk.edu.hk +Mingshen Sun (CUHK) MapReduce & Hadoop +Outline +• MapReduce Recap +• Design patterns +• in-mapper combing +• pairs and stripes +• order inversion +• value-to-key conversion +2 +Mingshen Sun (CUHK) MapReduce & Hadoop +MapReduce Recap +• Input and output: each a set of key/value pairs. +• Tow functions implemented by users. +• Map (k1, v1) -> list(k2, v2) +• takes an input key/value pair +• produces a set of intermediate key/value pairs +• Reduce (k2, list(v2)) -> list(k3, v3) +• takes a set of values for an intermediate key +• produces a set of output value +• MapReduce framework guarantees that all values associated with +the same key are brought together in the reducer +3 +Mingshen Sun (CUHK) MapReduce & Hadoop +MapReduce Recap +• Optional functions: +• Partition (k’, number of partitions) -> +partition for k’ +• dividing up the intermediate key space and assigning intermediate +key-value pairs to reducers +• often a simple hash of the key, e.g., hash(k’) mod n +• Combine (k2, list(v2)) -> list(k2’, v2’) +• mini-reducers that run in memory after the map phase +• used as an optimization to reduce network traffic +• will be discuss later +4 +Mingshen Sun (CUHK) MapReduce & Hadoop +MapReduce Recap +5 +30 CHAPTER 2. MAPREDUCE BASICS +A α B β C γ D δ E ε F ζ +mapper mapper mapper mapper +a 1 b 2 c 3 c 6 a 5 c 2 b 7 c 8 +combiner combiner combiner combiner +pp pp pp pp +a 1 b 2 c 9 a 5 c 2 b 7 c 8 +partitioner partitioner partitioner partitioner +Shuffle and Sort: aggregate values by keys +a 1 5 b 2 7 c 2 9 8 +p p p p +reducer reducer reducer +X 5 Y 7 Z 9 +Figure 2.4: Complete view of MapReduce, illustrating combiners and partitioners in addition +Mingshen Sun (CUHK) MapReduce & Hadoop +Goals +• Key question: MapReduce provides an elegant +programming model, but how should we recast a multitude +of algorithms into the MapReduce model? 
+• Goal of this lecture: provide a guide to MapReduce +algorithm design: +• design patterns, which form the building blocks of may problems +6 +Mingshen Sun (CUHK) MapReduce & Hadoop +Challenges +• MapReduce execution framework handles most complicated +details +• e.g., copy intermediate key-value pairs from mappers to reducers +grouped by key during the shuffle and sort stage +• Programmers have little control over MapReduce execution: +• Where a mapper or reducer runs +• When a mapper or reduce begins or finishes +• Which input key-value pairs are processed by a specific mapper +• Which intermediate key-value pairs are processed by a specific +reducer +7 +Mingshen Sun (CUHK) MapReduce & Hadoop +Challenges +• Things that programmers can control: +• Construct complex data structures as keys and +values to store and communicate partial results +• Execute user-specified initialization/termination code in a map or +reduce task +• Preserve state in both mappers and reducers across multiple input +or intermediate keys +• Control sort order of intermediate keys, and hence the order of how +a reducer processes keys +• Control partitioning of key space, and hence the set of keys +encountered by a reducer +8 +Mingshen Sun (CUHK) MapReduce & Hadoop +Challenges +• What we really want… +• No inherent bottlenecks as algorithms are applied to +increasingly large datasets +• linear scalability: an algorithm running on twice the amount of data +should take only twice as long +• an algorithm running on twice the number of nodes should only take +half as long +9 +Mingshen Sun (CUHK) MapReduce & Hadoop +Design Patterns +• Combiners and in-mapper combining +• aggregate map outputs to reduce data traffic being shuffled from +mappers to reducers +• Paris and stripes +• keep track of joint events +• Order inversion +• sort and control the sequence of computation +• Value-to-key conversion +• allow secondary sorting +10 +Mingshen Sun (CUHK) MapReduce & Hadoop +Local Aggregation +• In Hadoop, intermediate results (i.e., map outputs) are +written to local disk before being sent over the network +• network and disk latencies are expensive +• Local aggregation of intermediate results reduces the +number of key-value pairs that need to be shuffled from the +mappers to the reducers +• Default combiner: +• provided by the MapReduce framework +• aggregate map outputs with the same key +• acts like a mini-reducer +11 +Mingshen Sun (CUHK) MapReduce & Hadoop +Word Count: Baseline +• What is the number of records being shuffled? +• without combiners? +• with combiners? +12 +42 CHAPTER 3. MAPREDUCE ALGORITHM DESIGN +1: class Mapper +2: method Map(docid a, doc d) +3: for all term t 2 doc d do +4: Emit(term t, count 1) +1: class Reducer +2: method Reduce(term t, counts [c1, c2, . . .]) +3: sum 0 +4: for all count c 2 counts [c1, c2, . . .] do +5: sum sum + c +6: Emit(term t, count sum) +Figure 3.1: Pseudo-code for the basic word count algorithm in MapReduce (repeated from +Figure 2.3). +The first technique for local aggregation is the combiner, already discussed Section 2.4. Combiners provide a general mechanism within the MapReduce framework +to reduce the amount of intermediate data generated by the mappers—recall that they +can be understood as “mini-reducers” that process the output of mappers. 
In this +Mingshen Sun (CUHK) MapReduce & Hadoop +Implementation in Hadoop +public class WordCount { +} +13 +public static class TokenizerMapper +extends Mapper{ +private final static IntWritable one = new IntWritable(1); +private Text word = new Text(); +public void map(Object key, Text value, Context context +) throws IOException, InterruptedException { +StringTokenizer itr = new StringTokenizer(value.toString()); +while (itr.hasMoreTokens()) { +word.set(itr.nextToken()); +context.write(word, one); +} +} +} +Mingshen Sun (CUHK) MapReduce & Hadoop +Implementation in Hadoop +public class WordCount { +} +14 +public static class IntSumReducer +extends Reducer { +private IntWritable result = new IntWritable(); +public void reduce(Text key, Iterable values, +Context context +) throws IOException, InterruptedException { +int sum = 0; +for (IntWritable val : values) { +sum += val.get(); +} +result.set(sum); +context.write(key, result); +} +} +Mingshen Sun (CUHK) MapReduce & Hadoop +Implementation in Hadoop +public class WordCount { +} +15 +public static void main(String[] args) throws Exception { +Configuration conf = new Configuration(); +Job job = Job.getInstance(conf, "word count"); +job.setJarByClass(WordCount.class); +job.setMapperClass(TokenizerMapper.class); +job.setCombinerClass(IntSumReducer.class); +job.setReducerClass(IntSumReducer.class); +job.setOutputKeyClass(Text.class); +job.setOutputValueClass(IntWritable.class); +FileInputFormat.addInputPath(job, new Path(args[0])); +FileOutputFormat.setOutputPath(job, new Path(args[1])); +System.exit(job.waitForCompletion(true) ? 0 : 1); +} +Mingshen Sun (CUHK) MapReduce & Hadoop +Usage +• Environment +• Compile & Package +• Run +16 +export JAVA_HOME=/usr/java/default +export PATH=$JAVA_HOME/bin:$PATH +export HADOOP_CLASSPATH=$JAVA_HOME/lib/tools.jar +$ bin/hadoop com.sun.tools.javac.Main WordCount.java +$ jar cf wc.jar WordCount*.class +$ bin/hadoop jar wc.jar WordCount /user/joe/wordcount/ +input /user/joe/wordcount/output +Mingshen Sun (CUHK) MapReduce & Hadoop +Word Count: Version 1 +• in-mapper combining +• emits a key-value pair for each unique term per document +17 +3.1. LOCAL AGGREGATION 1: class Mapper +2: method Map(docid a, doc d) +3: H new AssociativeArray +4: for all term t 2 doc d do +5: H{t} H{t} + 1 . Tally counts for entire document +6: for all term t 2 H do +7: Emit(term t, count H{t}) +Figure 3.2: Pseudo-code for the improved MapReduce word count algorithm that associative array to aggregate term counts on a per-document basis. Reducer is the same Figure 3.1. +This basic idea can be taken one step further, as illustrated in the variant word count algorithm in Figure 3.3 (once again, only the mapper is modified). workings of this algorithm critically depends on the details of how map and tasks in Hadoop are executed, discussed in Section 2.6. Recall, a (Java) mapper is created for each map task, which is responsible for processing a block of input counts for entire document +Mingshen Sun (CUHK) MapReduce & Hadoop +Word Count: Version 2 +• in-mapper combining +• recall a map object is created for each map task +• aggregate all data appearing in the input block processed by the +map task +18 +44 CHAPTER 3. MAPREDUCE ALGORITHM DESIGN +1: class Mapper +2: method Initialize +3: H new AssociativeArray +4: method Map(docid a, doc d) +5: for all term t 2 doc d do +6: H{t} H{t} + 1 . 
Tally counts across documents +7: method Close +8: for all term t 2 H do +9: Emit(term t, count H{t}) +Figure 3.3: Pseudo-code for the improved MapReduce word count algorithm that demonstrates +the “in-mapper combining” design pattern. Reducer is the same as in Figure 3.1. +For example, Hadoop makes no guarantees on how many times the combiner is applied, +or that it is even applied at all. The combiner is provided as a semantics-preserving +optimization to the execution framework, which has the option of using it, perhaps +multiple times, or not at all (or even in the reduce phase). In some cases (although in this particular example), such indeterminism is unacceptable, which is exactly counts across documents +Setup() in Java +Cleanup() in Java +Mingshen Sun (CUHK) MapReduce & Hadoop +Combiners v.s. In-Mapper Combiners +• Advantages of in-mapper combiners: +• Provide control over where and how local aggregation takes place. +In contrast, semantics of default combiners are underspecified in +MapReduce. +• In-mapper combiners are applied inside the code. Default +combiners are applied inside the map outputs (after being emitted +by the map task). +• Disadvantages: +• States are preserved within mappers -> potentially large memory +overhead. +• algorithmic behavior may depend on the order in which input keyvalue +pairs are encountered - > potential order-dependent bugs. +19 +Mingshen Sun (CUHK) MapReduce & Hadoop +Combiner Design +• Combiner and reducer must share the same signature +• combiner is treated as mini-reducer +• combiner input and output key-value types must match reducer +input key-value type +• Remember: combiner are optional optimizations +• with/without combiner should not affect algorithm correctness +• may be run 0, 1, or multiple times, determined by the MapReduce +execution framework +• In Java, you can specify the combiner class as: +• public void setCombinerClass(Class cls) +• exactly the Reducer type +20 +Mingshen Sun (CUHK) MapReduce & Hadoop +Computing the Mean: Version 1 +• Any drawback? +• Can we use reducer as combiner? +• i.e., set combiner class to be reducer class +21 +3.1. LOCAL AGGREGATION 47 +1: class Mapper +2: method Map(string t, integer r) +3: Emit(string t, integer r) +1: class Reducer +2: method Reduce(string t, integers [r1, r2, . . .]) +3: sum 0 +4: cnt 0 +5: for all integer r 2 integers [r1, r2, . . .] do +6: sum sum + r +7: cnt cnt + 1 +8: ravg sum/cnt +9: Emit(string t, integer ravg) +Figure 3.4: Pseudo-code for the basic MapReduce algorithm that computes the mean of values +associated with the same key. +of values associated with the same key, and the reducer would compute the mean of +those values. As a concrete example, we know that: +Mean(1, 2, 3, 4, 5) 6= Mean(Mean(1, 2),Mean(3, 4, 5)) +Pseudo-code for the basic +MapReduce algorithm that +computes the mean of +values associated with the +same key. +Mingshen Sun (CUHK) MapReduce & Hadoop +Computing the Mean: Version 1 +• Mean of the means is not the original mean. +• e.g., +• mean(1, 2, 3, 4, 5) != mean(mean(1, 2), mean(3, 4, 5)) +• It’s not a problem for Word Count problem, but it’s a +problem here. +22 +Mingshen Sun (CUHK) MapReduce & Hadoop +Computing the Mean: Version 2 +• Does it work? Why? +• recall that combiners must have the same input and output keyvalue +type +• Why? +• combiners are optimizations that cannot change the correctness of +the algorithm +23 +48 CHAPTER 3. 
MAPREDUCE ALGORITHM DESIGN +1: class Mapper +2: method Map(string t, integer r) +3: Emit(string t, integer r) +1: class Combiner +2: method Combine(string t, integers [r1, r2, . . .]) +3: sum 0 +4: cnt 0 +5: for all integer r 2 integers [r1, r2, . . .] do +6: sum sum + r +7: cnt cnt + 1 +8: Emit(string t, pair (sum, cnt)) . Separate sum and count +1: class Reducer +2: method Reduce(string t, pairs [(s1, c1), (s2, c2) . . .]) +3: sum 0 +4: cnt 0 +5: for all pair (s, c) 2 pairs [(s1, c1), (s2, c2) . . .] do +6: sum sum + s +7: cnt cnt + c +8: ravg sum/cnt +9: Emit(string t, integer ravg) +Figure 3.5: Pseudo-code for an incorrect first attempt at introducing combiners to compute +the mean of values associated with each key. The mismatch between combiner input and output +key-value types violates the MapReduce programming model. +3: sum 0 +4: cnt 0 +5: for all integer r 2 integers [r1, r2, . . .] do +6: sum sum + r +7: cnt cnt + 1 +8: Emit(string t, pair (sum, cnt)) . Separate 1: class Reducer +2: method Reduce(string t, pairs [(s1, c1), (s2, c2) . . .]) +3: sum 0 +4: cnt 0 +5: for all pair (s, c) 2 pairs [(s1, c1), (s2, c2) . . .] do +6: sum sum + s +7: cnt cnt + c +8: ravg sum/cnt +9: Emit(string t, integer ravg) +Figure 3.5: Pseudo-code for an incorrect first attempt at introducing the mean of values associated with each key. The mismatch between combiner key-value types violates the MapReduce programming model. +chapter. We will frequently encounter complex keys and values this book. +Unfortunately, this algorithm will not work. Recall that combiners same input and output key-value type, which also must be the output type and the reducer input type. This is clearly not the why this restriction is necessary in the programming model, remember are optimizations that cannot change the correctness of the algorithm. the combiner and see what happens: the output value type of so the reducer expects to receive a list of integers as values. But expects a list of pairs! The correctness of the algorithm is contingent running on the output of the mappers, and more specifically, that +Mingshen Sun (CUHK) MapReduce & Hadoop +Computing the Mean: Version 3 +• Does it work? Why? +24 +3.1. LOCAL AGGREGATION 49 +1: class Mapper +2: method Map(string t, integer r) +3: Emit(string t, pair (r, 1)) +1: class Combiner +2: method Combine(string t, pairs [(s1, c1), (s2, c2) . . .]) +3: sum 0 +4: cnt 0 +5: for all pair (s, c) 2 pairs [(s1, c1), (s2, c2) . . .] do +6: sum sum + s +7: cnt cnt + c +8: Emit(string t, pair (sum, cnt)) +1: class Reducer +2: method Reduce(string t, pairs [(s1, c1), (s2, c2) . . .]) +3: sum 0 +4: cnt 0 +5: for all pair (s, c) 2 pairs [(s1, c1), (s2, c2) . . .] do +6: sum sum + s +7: cnt cnt + c +8: ravg sum/cnt +9: Emit(string t, integer ravg) +Figure 3.6: Pseudo-code for a MapReduce algorithm that computes the mean of values associated +Mingshen Sun (CUHK) MapReduce & Hadoop +Computing the Mean: Version 4 +• Does it work? +• Do we need a combiner? +25 +50 CHAPTER 3. MAPREDUCE ALGORITHM DESIGN +1: class Mapper +2: method Initialize +3: S new AssociativeArray +4: C new AssociativeArray +5: method Map(string t, integer r) +6: S{t} S{t} + r +7: C{t} C{t} + 1 +8: method Close +9: for all term t 2 S do +10: Emit(term t, pair (S{t}, C{t})) +Figure 3.7: Pseudo-code for a MapReduce algorithm that computes the mean of values associated +with each key, illustrating the in-mapper combining design pattern. Only the mapper is +shown here; the reducer is the same as in Figure 3.6 +and one. 
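+In Hadoop terms, the in-mapper combining mapper for the mean could look like the sketch below. This is only a sketch: it assumes the input is already (string key, integer value) pairs (e.g. read from a SequenceFile), and the tab-separated "sum<TAB>count" value encoding and the class name are assumptions made here, not part of the slides; a real job would typically use a custom pair Writable, and the reducer would sum the partial pairs and divide sum by count.
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+
+// Sketch of Figure 3.7: preserve per-key (sum, count) state across map() calls
+// and emit the partial pairs once, in cleanup() (Close in the pseudo-code).
+public class MeanInMapperCombiningMapper
+    extends Mapper<Text, IntWritable, Text, Text> {
+
+  private Map<String, Long> sums;
+  private Map<String, Long> counts;
+
+  @Override
+  protected void setup(Context context) {
+    sums = new HashMap<>();
+    counts = new HashMap<>();
+  }
+
+  @Override
+  protected void map(Text key, IntWritable value, Context context) {
+    String k = key.toString();
+    sums.merge(k, (long) value.get(), Long::sum);
+    counts.merge(k, 1L, Long::sum);
+  }
+
+  @Override
+  protected void cleanup(Context context) throws IOException, InterruptedException {
+    for (String k : sums.keySet()) {
+      // Partial (sum, count) for this map task, encoded as "sum<TAB>count".
+      context.write(new Text(k), new Text(sums.get(k) + "\t" + counts.get(k)));
+    }
+  }
+}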
The reducer would still arrive at the correct sum and count, and hence the +mean would be correct. Now add in the combiners: the algorithm would remain correct, +no matter how many times they run, since the combiners merely aggregate partial sums +and counts to pass along to the reducers. Note that although the output key-value type +of the combiner must be the same as the input key-value type of the reducer, the reducer +Mingshen Sun (CUHK) MapReduce & Hadoop +Pairs and Stripes +• To illustrate how constructing complex keys and values +improves the performance of computation. +26 +Mingshen Sun (CUHK) MapReduce & Hadoop +A New Running Example +• Problem: building a word co-occurrence matrix over a text +collection +• M = n * n matrix (n = number of unique words) +• m[i][j] = number of times word w[i] co-occurs with word w[j] within a +specific context (e.g., same sentence, same paragraph, same +document) +• it is easy to show that m[i][j] == m[j][i] +• Why this problem is interesting? +• distributional profiles of words +• information retrieval +• statistical natural language processing +27 +Mingshen Sun (CUHK) MapReduce & Hadoop +Challenge +• Space requirement: O(n^2). +• too big if we simply store the whole matrix with billions of words in +memory +• a single machine typically cannot keep the whole matrix +• How to use MapReduce to implement this large counting +problem? +• Our approach: +• mappers generate partial counts +• reducers aggregate partial counts +28 +Mingshen Sun (CUHK) MapReduce & Hadoop +Pairs +• Each mapper: +• Emits intermediate key-value pairs with each co-occurring word pair +and integer 1 +• Each reducer: +• Sums up all values associated with the same co-occurring word pair +• MapReduce execution framework guarantees that all values +associated with the same key are brought together in the reducer +29 +Mingshen Sun (CUHK) MapReduce & Hadoop +Pairs +• Can we use the default combiner here? +30 +3.2. PAIRS AND STRIPES 53 +1: class Mapper +2: method Map(docid a, doc d) +3: for all term w 2 doc d do +4: for all term u 2 Neighbors(w) do +5: Emit(pair (w, u), count 1) . Emit count for each co-occurrence +1: class Reducer +2: method Reduce(pair p, counts [c1, c2, . . .]) +3: s 0 +4: for all count c 2 counts [c1, c2, . . .] do +5: s s + c . Sum co-occurrence counts +6: Emit(pair p, count s) +Figure 3.8: Pseudo-code for the “pairs” approach for computing word co-occurrence matrices +from large corpora. 
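+A Hadoop sketch of this pairs mapper and reducer follows. The details are assumptions made here for brevity (the slides leave them open): the pair key is encoded as a single Text "w,u" instead of a custom WritableComparable, and Neighbors(w) is taken to be the other tokens on the same input line.
+import java.io.IOException;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+
+public class CooccurrencePairs {
+
+  // Emit ((w, u), 1) for every co-occurring pair of terms.
+  public static class PairsMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
+    private static final IntWritable ONE = new IntWritable(1);
+    private final Text pair = new Text();
+
+    @Override
+    public void map(LongWritable offset, Text line, Context context)
+        throws IOException, InterruptedException {
+      String[] terms = line.toString().split("\\s+");
+      for (String w : terms) {
+        for (String u : terms) {
+          if (!w.isEmpty() && !u.isEmpty() && !w.equals(u)) {
+            pair.set(w + "," + u);
+            context.write(pair, ONE);
+          }
+        }
+      }
+    }
+  }
+
+  // Sum the co-occurrence counts for each pair.
+  public static class PairsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
+    @Override
+    public void reduce(Text pair, Iterable<IntWritable> counts, Context context)
+        throws IOException, InterruptedException {
+      int sum = 0;
+      for (IntWritable c : counts) {
+        sum += c.get();
+      }
+      context.write(pair, new IntWritable(sum));
+    }
+  }
+}
+With this encoding the reducer's input and output types match, so PairsReducer could also be registered as the combiner.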
+1: class Mapper +2: method Map(docid a, doc d) +Mingshen Sun (CUHK) MapReduce & Hadoop +Stripes +• Each mapper: +• For each particular word, stores co-occurrence information in an +associative array +• Emits intermediate key-value pairs with words as keys and +corresponding associative arrays as values +• Each reducer: +• Sums all the counts in the associative arrays +• MapReduce execution framework guarantees that all associative +arrays with the same key are brought together in the reducer +31 +Mingshen Sun (CUHK) MapReduce & Hadoop +Stripes +• Example: +• Each mapper emits +• a -> {b: count(b), c: count(c), d: count(d) …} +• Reducers perform element-wise sum of associative arrays +32 +(a, b) -> 1 +(a, c) -> 2 +(a, d) -> 5 +(a, e) -> 3 +(a, f) -> 2 +a -> {b: 1, c: 2, d: 5, e: 3, f: 2} +a -> {b: 1, , d: 5, e: 3 } ++ a -> {b: 1, c: 2, d: 2, f: 2} +———————————————————————————————————————— +a -> {b: 2, c: 2, d: 7, e: 3, f: 2} +Mingshen Sun (CUHK) MapReduce & Hadoop +Stripes +• pseudo-code of stripes approach +33 +1: class Mapper +2: method Map(docid a, doc d) +3: for all term w 2 doc d do +4: H new AssociativeArray +5: for all term u 2 Neighbors(w) do +6: H{u} H{u} + 1 . Tally words co-occurring with w +7: Emit(Term w, Stripe H) +1: class Reducer +2: method Reduce(term w, stripes [H1,H2,H3, . . .]) +3: Hf new AssociativeArray +4: for all stripe H 2 stripes [H1,H2,H3, . . .] do +5: Sum(Hf,H) . Element-wise sum +6: Emit(term w, stripe Hf ) +Figure 3.9: Pseudo-code for the “stripes” approach for computing word co-occurrence matrices +from large corpora. +Mingshen Sun (CUHK) MapReduce & Hadoop +Pairs v.s. Stripes +• Pairs: +• Pro: Easy to understand and implement +• Con: Generate many key-value pairs +• Stripes: +• Pro: Generate fewer key-value pairs +• Pro: Make better use of combiners +• Con: Memory size of associative arrays in mappers could be huge +• Both pairs and stripes can apply in-mapper combining +34 +Mingshen Sun (CUHK) MapReduce & Hadoop +Pairs v.s. Stripes +• stripes much faster than pairs +• linearity is maintained +35 +56 CHAPTER 3. MAPREDUCE ALGORITHM DESIGN +0 +500 +1000 +1500 +2000 +2500 +3000 +3500 +4000 +0 20 40 60 80 100 +Running time (seconds) +Percentage of the APW corpus +R2 = 0.992 +R2 = 0.999 +"stripes" approach +"pairs" approach +Figure 3.10: Running time of the “pairs” and “stripes” algorithms for computing word cooccurrence +matrices on di↵erent fractions of the APW corpus. These experiments were performed +on a Hadoop cluster with 19 slaves, each with two single-core processors and two disks. +5000 +Mingshen Sun (CUHK) MapReduce & Hadoop +Relative Frequencies +• Drawback of co-occurrence counts +• absolute counts doesn’t consider that some words appear more +frequently than others +• e.g., “is” occurs very often by itself +• doesn’t imply “is good” occurs more frequently than “Hello World” +• Estimate relative frequencies instead of counts +• How do we apply MapReduce to this problem? +36 +Relative Frequencies +Drawback of co-occurrence counts: +• Absolute counts doesn’t consider that some words +appear more frequently than others +• e.g., “is” occurs very often by itself. It doesn’t imply +“is good” occurs more frequently than “Hello World” +Estimate relative frequencies instead of counts: +How do we MapReduce to this problem? 
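+In Hadoop terms, the stripes representation answers this almost directly, as the next slides explain. A sketch of such a reducer is shown here; the MapWritable stripe encoding, the "A, B" output key format and the class name are illustrative assumptions, not part of the slides.
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Reducer;
+
+// Sketch: compute f(B|A) = count(A, B) / count(A) from the stripes
+// {B: count(A, B), ...} emitted by a stripes-style mapper for word A.
+public class StripesRelFreqReducer extends Reducer<Text, MapWritable, Text, DoubleWritable> {
+
+  @Override
+  public void reduce(Text a, Iterable<MapWritable> stripes, Context context)
+      throws IOException, InterruptedException {
+    Map<String, Integer> merged = new HashMap<>();
+    long marginal = 0;
+    // Element-wise sum of all partial stripes for A; the marginal is the total.
+    for (MapWritable stripe : stripes) {
+      for (Map.Entry<Writable, Writable> e : stripe.entrySet()) {
+        int c = ((IntWritable) e.getValue()).get();
+        merged.merge(e.getKey().toString(), c, Integer::sum);
+        marginal += c;
+      }
+    }
+    // Emit the relative frequency of every word B co-occurring with A.
+    for (Map.Entry<String, Integer> e : merged.entrySet()) {
+      context.write(new Text(a.toString() + ", " + e.getKey()),
+          new DoubleWritable((double) e.getValue() / marginal));
+    }
+  }
+}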
+31
+f(B|A) = count(A, B) / count(A) = count(A, B) / Σ_B' count(A, B')
+(count(A), the marginal, is the sum of the joint counts over all B')
+Mingshen Sun (CUHK) MapReduce & Hadoop
+Relative Frequencies
+• Computing relative frequencies with the stripes approach is
+straightforward
+• Sum all the counts in the associative array for each word
+• Why is this possible in MapReduce?
+• Drawback: assuming that each associative array fits into memory
+• How to compute relative frequencies with the pairs
+approach?
+37
+Mingshen Sun (CUHK) MapReduce & Hadoop
+Relative Frequencies with Pairs
+• Mapper emits (a, *) for every word being observed
+• Mapper makes sure the same word goes to the same reducer
+(use a partitioner)
+• Mapper makes sure (a, *) comes first, before individual
+counts (how?)
+• Reducer holds state to remember the count of (a, *), until all
+pairs with the word “a” have been computed
+38
+(a, *) -> 32
+(a, b1) -> 3
+(a, b2) -> 12
+(a, b3) -> 7
+(a, b4) -> 1
+…
+reducer holds this value in
+memory
+(a, b1) -> 3/32
+(a, b2) -> 12/32
+(a, b3) -> 7/32
+(a, b4) -> 1/32
+…
+Mingshen Sun (CUHK) MapReduce & Hadoop
+Order Inversion
+• Why order inversion?
+• Computing relative frequencies requires marginal counts
+• But the marginal cannot be computed until you see all counts
+• Buffering is a bad idea!
+• Trick: getting the marginal counts to arrive at the reducer before the
+joint counts
+• MapReduce allows you to define the order of keys being
+processed by the reducer
+• shuffle and sort
+39
+Mingshen Sun (CUHK) MapReduce & Hadoop
+Order Inversion: Idea
+• How to use the design pattern of order inversion to compute
+relative frequencies via the pairs approach?
+• Emit a special key-value pair for each co-occurring word for the
+computation of the marginal
+• Control the sort order of the intermediate key so that the marginal
+count comes before individual counts
+• Define a custom partitioner to ensure all pairs with the same left
+word are shuffled to the same reducer
+• Preserve state in the reducer to remember the marginal count for each
+word
+40
+Mingshen Sun (CUHK) MapReduce & Hadoop
+Secondary Sorting
+• MapReduce sorts input to reducers by key
+• values may be arbitrarily ordered
+• What if we want to sort values as well?
+• Scenario:
+• sensors record temperature over time
+• each sensor emits (id, time t, temperature v)
+41
+Mingshen Sun (CUHK) MapReduce & Hadoop
+Secondary Sorting
+• Naive solution
+• each sensor emits
+• id -> (t, v)
+• all readings of sensor id will be aggregated into a reducer
+• buffer values in memory for all id, then sort
+• Why is this a bad idea?
+42
+Mingshen Sun (CUHK) MapReduce & Hadoop
+Secondary Sorting
+• Value-to-key conversion
+• each mapper emits
+• (id, t) -> v
+• let the execution framework do the sorting
+• preserve state across multiple key-value pairs to handle processing
+• anything else?
+• Main idea: sorting is offloaded from the reducer (in naive +approach) to the MapReduce framework +43 +Mingshen Sun (CUHK) MapReduce & Hadoop +Tools for Synchronization +• Cleverly-constructed data structures +• Bring data together +• Sort order of intermediate keys +• Control order in which reducers process keys +• Partitioner +• Control which reducer processes which keys +• Preserving state in mappers and reducers +• Capture dependencies across multiple keys and values +44 +Mingshen Sun (CUHK) MapReduce & Hadoop +Issues and Tradeoffs +• Number of key-value pairs +• Object creation overhead +• Time for sorting and shuffling pairs across the network +• Size of each key-value pair +• De/serialization overhead +• Local aggregation +• Opportunities to perform local aggregation varies +• Combiners make a big difference +• Combiners vs. in-mapper combining +• RAM vs. disk vs. network +45 +Mingshen Sun (CUHK) MapReduce & Hadoop +Debugging at Scale +• Works on small datasets, won’t scale... why? +• Memory management issues (buffering and object +creation) +• Too much intermediate data +• Mangled input records +• Real-world data is messy! +• Word count: how many unique words in Wikipedia? +• There’s no such thing as “consistent data” +• Watch out for corner cases +• Isolate unexpected behavior, bring local +46 +Mingshen Sun (CUHK) MapReduce & Hadoop +Summary +• Design patterns +• in-mapper combing +• pairs and stripes +• order inversion +• value-to-key conversion +47 +Mingshen Sun (CUHK) MapReduce & Hadoop +MapReduce Application +• Text retrieval +• inverted indexing +• Data mining +• TF-IDF +• Graph algorithm +• parallel breadth-first search +• parallel dijkstra’s algorithm +• PageRank +48 +Mingshen Sun (CUHK) MapReduce & Hadoop +Web Search Problem +• Web search is to retrieve relevant web objects +• e.g., web pages, PDFs, PPT slides +• Web search problem +• crawling: gathering web content +• indexing: constructing search indexing structure +• retrieval: ranking documents given a query +• Challenge +• the web is huge +• billions of web objects, terabytes of information +• Performance goals +• query latency needs to be small +• scalable for a large number of documents +49 +Mingshen Sun (CUHK) MapReduce & Hadoop +Inverted Indexes +• Inverted Index +• A data structure that given a term provides access to the list of +documents that contain the term +• Used by most full-text search engines today +• By documents, we mean web objects +• Retrieval engine uses the inverted index to score documents +that contain the query terms based on some ranking model +• e.g., based on term matches, term proximity, term attributes, etc. +50 +Mingshen Sun (CUHK) MapReduce & Hadoop +Inverted Indexes +• Simple illustration of an inverted index. +• Each term is associated with a list of postings. +• Each posting is comprised of a document id and a payload, denoted +by p in this case. +• An inverted index provides quick access to documents ids that +contain a term. +51 +74 CHAPTER 4. INVERTED INDEXING FOR TEXT RETRIEVAL +terms postings +term1 +term2 +term3 +d1 p d5 p d6 p d11 p +d11 p d23 p d59 p d84 p +3 d1 p d4 p d11 p d19 p +1 4 11 19 Figure 4.1: Simple illustration of an inverted index. Each term is associated with a list postings. Each posting is comprised of a document id and a payload, denoted by p in this case. +An inverted index provides quick access to documents ids that contain a term. +the front of a postings list. 
Either way, an auxiliary data structure is necessary maintain the mapping from integer document ids to some other more meaningful handle, +such as a URL. +Given a query, retrieval involves fetching postings lists associated with query terms +and traversing the postings to compute the result set. In the simplest case, boolean +Mingshen Sun (CUHK) MapReduce & Hadoop +Inverted Indexes +• Given a query, retrieval involves fetching postings lists +associated with query terms and traversing the postings to +compute the result set. +• Simple Boolean retrieval: +• Apply union (OR) or intersection (AND) of posting lists +• General retrieval: +• Document scores are ranked +• Top k documents are returned +52 +Mingshen Sun (CUHK) MapReduce & Hadoop +Inverted Indexes +53 +one$fish,$two$fish +Doc$1 +red$fish,$blue$fish +Doc$2 +cat$in$the$hat +Doc$3 +1 +1 +1 +1 +1 +1 +1 2 3 +1 +1 +1 +4 +blue +cat +egg +fish +green +ham +hat +one +3 +4 +1 +4 +4 +3 +2 +1 +blue +cat +egg +fish +green +ham +hat +one +2 +green$eggs$and$ham +Doc$4 +red 1 +two 1 +red 2 +two 1 +Mingshen Sun (CUHK) MapReduce & Hadoop +Inverted Indexes: Construction +• How to construct an inverted index? +• Naive approach: +• For each document, extract all useful terms, and exclude all +stopwords (e.g., “the”, “a”, “of”) and remove affixes (e.g., “dogs” to +“dog”) +• For each term, add the posting (document, payload) to an existing +list, or create a posting list if the term is new +• Clearly, naive approach is not scalable if the document +collection is huge and each document is large +• Can we use MapReduce? +54 +Mingshen Sun (CUHK) MapReduce & Hadoop +Baseline Implementation +• Our goal: construct an inverted index given a document +collection +• Main idea: +• Input to each mapper: +• Document IDs (keys) +• Actual document content (values) +• What each mapper does: +• Analyze each document and extract useful terms +• Compute term frequencies (per document) +• Emit (term, posting) +• What each reducer does +• Aggregates all observed postings for each term +• Construct the posting list +55 +Mingshen Sun (CUHK) MapReduce & Hadoop +Baseline Implementation +56 +4.5. INDEX COMPRESSION 79 +1: class Mapper +2: method Map(docid n, doc d) +3: H new AssociativeArray +4: for all term t 2 doc d do +5: H{t} H{t} + 1 +6: for all term t 2 H do +7: Emit(tuple ht, ni, tf H{t}) +1: class Reducer +2: method Initialize +3: tprev ; +4: P new PostingsList +5: method Reduce(tuple ht, ni, tf [f]) +6: if t 6= tprev ^ tprev 6= ; then +7: Emit(term t, postings P) +8: P.Reset() +9: P.Add(hn, fi) +10: tprev t +11: method Close +12: Emit(term t, postings P) +Mingshen Sun (CUHK) MapReduce & Hadoop +Baseline Implementation +57 +4.4. 
INVERTED INDEXING: REVISED IMPLEMENTATION 77 +one fish, two fish +doc 1 +red fish, blue fish +doc 2 +one red bird +doc 3 +mapper mapper mapper +fish d1 2 +one d1 1 +two d1 1 +blue d2 1 +fish d2 2 +red d2 1 +bird d3 1 +one d3 1 +red d3 1 +reducer +Shuffle and Sort: aggregate values by keys +reducer fish d1 2 d2 2 bird d3 1 +one d1 1 +two d1 1 +blue d2 1 +red d2 1 d3 1 +d3 1 +Simple illustration of the baseline inverted indexing algorithm in MapReduce with +Mingshen Sun (CUHK) MapReduce & Hadoop +Baseline Implementation +• In the shuffle and sort phase, MapReduce framework forms +a large, distributed group by the postings of each term +• From reducer’s point of view +• Each input to the reducer is the resulting posting list of a term +• Reducer may sort the list (if needed), and writes the final output to +disk +• The task of each reducer is greatly simplified! MapReduce +framework has done most heavy liftings. +58 +Mingshen Sun (CUHK) MapReduce & Hadoop +Positional Indexes +59 +1 +1 +2 +1 +1 +2 2 +1 +1 +1 +1 +1 +1 +1 +1 +2 +one 1 +two 1 +fish 1 +one fish, two fish +Doc 1 +red 2 +blue 2 +fish 2 +red fish, blue fish +Doc 2 +cat 3 +hat 3 +cat in the hat +Doc 3 +fish 1 2 +one 1 +two 1 +red 2 +cat 3 +blue 2 +hat 3 +Shuffle and Sort: aggregate values by keys +Map +Reduce +Mingshen Sun (CUHK) MapReduce & Hadoop +Scalability Issue +• Scalability problem in baseline implementation +60 +4.3. INVERTED INDEXING: BASELINE IMPLEMENTATION 1: class Mapper +2: procedure Map(docid n, doc d) +3: H new AssociativeArray +4: for all term t 2 doc d do +5: H{t} H{t} + 1 +6: for all term t 2 H do +7: Emit(term t, posting hn,H{t}i) +1: class Reducer +2: procedure Reduce(term t, postings [hn1, f1i, hn2, f2i . . .]) +3: P new List +4: for all posting ha, fi 2 postings [hn1, f1i, hn2, f2i . . .] do +5: Append(P, ha, fi) +6: Sort(P) +7: Emit(term t, postings P) +Figure 4.2: Pseudo-code of the baseline inverted indexing algorithm in MapReduce. Any problem? +Mingshen Sun (CUHK) MapReduce & Hadoop +Scalability Issue +• Assumption of baseline implementation: +• Reducer has sufficient memory to hold all postings associated with +the same term +• Why? +• The MapReduce framework makes no guarantees about the +ordering of values associated with the same key. +• The reducer first buffers all postings (line 5) and then performs an +in-memory sort before writing the postings to disk +61 +Mingshen Sun (CUHK) MapReduce & Hadoop +Scalability Issue +• How to solve? Key idea is to let MapReduce framework do +sorting for us +• Instead of emitting +• (term t, posting ) +• Emit +• (tuple , f) +• Value-to-key conversion!! +62 +Mingshen Sun (CUHK) MapReduce & Hadoop +Revised Implementation +• With value-to-key conversion, the MapReduce framework +ensures the postings arrive in sorted order (based on ) +• Results can be written to disk directly +• Caution: you need a customized partitioner to ensure that all +tuples with the same term are shuffled to the same reducer +63 +Mingshen Sun (CUHK) MapReduce & Hadoop +Revised Implementation +64 +4.5. 
INDEX COMPRESSION 79 +1: class Mapper +2: method Map(docid n, doc d) +3: H new AssociativeArray +4: for all term t 2 doc d do +5: H{t} H{t} + 1 +6: for all term t 2 H do +7: Emit(tuple ht, ni, tf H{t}) +1: class Reducer +2: method Initialize +3: tprev ; +4: P new PostingsList +5: method Reduce(tuple ht, ni, tf [f]) +6: if t 6= tprev ^ tprev 6= ; then +7: Emit(term t, postings P) +8: P.Reset() +9: P.Add(hn, fi) +10: tprev t +11: method Close +12: Emit(term t, postings P) +Figure 4.4: Pseudo-code of a scalable inverted indexing algorithm in MapReduce. By applying +results are directly written to +disk +Mingshen Sun (CUHK) MapReduce & Hadoop +TF-IDF +• Term Frequency – Inverse Document Frequency (TF-IDF) +• Answers the question “How important is this term in a document” +• Known as a term weighting function +• Assigns a score (weight) to each term (word) in a document +• Very commonly used in text processing and search +• Has many applications in data mining +65 +Mingshen Sun (CUHK) MapReduce & Hadoop +TF-IDF Motivation +• Merely counting the number of occurrences of a word in a +document is not a good enough measure of its relevance +• If the word appears in many other documents, it is probably less +relevance +• Some words appear too frequently in all documents to be relevant +• Known as ‘stopwords’ +• TF-IDF considers both the frequency of a word in a given +document and the number of documents which contain the +word +66 +Mingshen Sun (CUHK) MapReduce & Hadoop +TF-IDF: Definition +• Term Frequency (TF) +• Number of times a term appears in a +• document (i.e., the count) +• Inverse Document Frequency (IDF) +• N: total number of documents +• n: number of documents that contain a term +• TF-IDF +• TF × IDF +67 +idf = log ( +N +n +) +Mingshen Sun (CUHK) MapReduce & Hadoop +Computing TF-IDF With MapReduce +• Overview of algorithm: 3 MapReduce jobs +• Job 1: compute term frequencies +• Job 2: compute number of documents each word +occurs in +• Job 3: compute TD-IDF +68 +Mingshen Sun (CUHK) MapReduce & Hadoop +Graph: Real-World Problems +• Finding shortest paths +• Routing Internet traffic and UPS trucks +• Finding minimum spanning trees +• Telco laying down fiber +• Finding Max Flow +• Airline scheduling +• Identify “special” nodes and communities +• Breaking up terrorist cells, spread of avian flu +• Bipartite matching +• Monster.com, Match.com +• PageRank +69 +Mingshen Sun (CUHK) MapReduce & Hadoop +Graphs and MapReduce +• Graph algorithms typically involve: +• Performing computations at each node: based on +node features, edge features, and local link structure +• Propagating computations: “traversing” the graph +• Challenge: +• Algorithms running on a single machine and putting +the entire graph in memory are not scalable +• Key questions: +• How do you represent graph data in MapReduce? +• How do you traverse a graph in MapReduce? +70 +Mingshen Sun (CUHK) MapReduce & Hadoop +Graph Representations +• Two common representations +• adjacency matrix +• adjacency list +71 +5.1. GRAPH REPRESENTATIONS n1 +n2 +n1 n2 n3 n4 n5 +n1 0 1 0 1 0 +n2 0 0 1 0 1 +n1 [n2, n4] +n2 [n3, n5] +n3 +n5 +n3 0 0 0 1 0 +n4 0 0 0 0 1 +n5 1 1 1 0 0 +n3 [n4] +n4 [n5] +n5 [n1, n2, n3] +n4 adjacency matrix adjacency lists +Figure 5.1: A simple directed graph (left) represented as an adjacency matrix (middle) and +adjacency lists (right). +parallel breadth-first search (Section 5.2) and PageRank (Section 5.3). 
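+To make the adjacency-list representation concrete for the MapReduce graph algorithms that follow, each node can be serialized as one text record such as "n1<TAB>0<TAB>n2,n4" (node id, current distance, out-links). The record format and the helper class below are assumptions for illustration only, not something prescribed by the slides.
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+// One node per line: "id<TAB>distance<TAB>adj1,adj2,...".
+// Integer.MAX_VALUE stands in for an "infinite" (undiscovered) distance.
+public class NodeRecord {
+  final String id;
+  final int distance;
+  final List<String> adjacency;
+
+  NodeRecord(String id, int distance, List<String> adjacency) {
+    this.id = id;
+    this.distance = distance;
+    this.adjacency = adjacency;
+  }
+
+  static NodeRecord parse(String line) {
+    String[] fields = line.split("\t");
+    List<String> adj = fields.length > 2 && !fields[2].isEmpty()
+        ? Arrays.asList(fields[2].split(",")) : Collections.emptyList();
+    return new NodeRecord(fields[0], Integer.parseInt(fields[1]), adj);
+  }
+
+  @Override
+  public String toString() {
+    return id + "\t" + distance + "\t" + String.join(",", adjacency);
+  }
+}
+A parallel BFS mapper would parse such a record, emit (m, distance + 1) for every out-link m, and re-emit the record itself so the graph structure is preserved across iterations.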
Before concluding +• easy to manipulate with linear algebra +• easy algorithmic implementation +• large memory space, esp. for sparse +graph +• much more compact representation +• easy to compute over out-links +• much more difficult to compute over +in-links +• How ever, the shuffle and sort +mechanism in MapReduce provides +an easy way to group edges by +destination nodes. +Mingshen Sun (CUHK) MapReduce & Hadoop +Single-Source Shortest Path +• Problem: find shortest paths from a source node to all other +nodes in the graph +• Shortest mean smallest hop counts or lowest weights +• Algorithm: +• Breadth-first-search: for finding minimum hop counts +• Dijkstra’s algorithm: for finding minimum-cost paths for general +graphs +72 +Mingshen Sun (CUHK) MapReduce & Hadoop +Dijkstra’s Algorithm +73 +96 CHAPTER 5. GRAPH ALGORITHMS +∞ 1 ∞ +n2 n4 +10 1 ∞ +n2 n4 +8 1 14 +n2 n4 +0 +10 +5 +2 3 +9 +4 6 +n1 +0 +10 +5 +2 3 +9 +4 6 +n1 +0 +10 +5 +2 3 +9 +4 6 +n1 +∞ ∞ +2 +7 1 +n3 n5 +5 ∞ +2 +7 1 +n3 n5 +5 7 +2 +7 1 +n3 n5 +(a) (b) (c) +8 13 +10 +1 +n2 n4 +8 9 +10 +1 +n2 n4 +8 9 +10 +1 +n2 n4 +0 +5 7 +5 +2 3 +9 +7 +4 6 +n1 +0 +5 7 +5 +2 3 +9 +7 +4 6 +n1 +0 +5 7 +5 +2 3 +9 +7 +4 6 +n1 +2 +n3 n5 +2 +n3 n5 +2 +n3 n5 +(d) (e) (f) +Figure 5.3: Example of Dijkstra’s algorithm applied to a simple graph with five nodes, with n1 +as the source and edge distances as indicated. Parts (a)–(e) show the running of the algorithm +at each iteration, with the current distance inside the node. Nodes with thicker borders are +those being expanded; nodes that have already been expanded are shown in black. +Mingshen Sun (CUHK) MapReduce & Hadoop +Dijkstra’s Algorithm +• Dijkstra’s algorithm is designed as a sequential algorithm +• Key to Dijkstra’s algorithm +• Priority queue that maintains a globally sorted list of nodes by +current distance +• Not possible in MapReduce, which doesn’t provide a mechanism for +exchanging global data +• Solution: +• Brute-force approach: parallel breadth first search +• Brute force: Try to revisit many nodes that have been visited +74 +Mingshen Sun (CUHK) MapReduce & Hadoop +Parallel BFS +• Consider simple case of equal edge weights +• Solution to the problem can be defined inductively +• Here’s the intuition: +• Define: b is reachable from a if b is on adjacency list of a +• DistanceTo(s) = 0 +• For all nodes p reachable from s, DistanceTo(p) = 1 +• For all nodes n reachable from some other set of nodes M, +DistanceTo(n) = 1 + min(DistanceTo(m), m \in M) +75 +s +m3 +m2 +m1 +n +… +… +… +d1 +d2 +d3 +Mingshen Sun (CUHK) MapReduce & Hadoop +Visualizing Parallel BFS +76 +n0 +n3 n2 +n1 +n7 +n6 +n5 +n4 +n9 +n8 +Mingshen Sun (CUHK) MapReduce & Hadoop +From Intuition to Algorithm +• Data representation: +• Key: node n +• Value: d (distance from start), adjacency list (nodes reachable from +n) +• Initialization: for all nodes except for start node, d = infinity +• Mapper: +• exit m in adjacency list: emit (m, d + 1) +• Sort/Shuffle +• Groups distances by reachable nodes +• Reducer: +• Selects minimum distance path for each reachable node +• Additional bookkeeping needed to keep track of actual path +77 +Mingshen Sun (CUHK) MapReduce & Hadoop +Multiple Iterations Needed +• Each MapReduce iteration advances the “frontier” by one +hop +• Subsequent iterations include more and more reachable nodes as +frontier expands +• Multiple iterations are needed to explore entire graph +• Preserving graph structure: +• Problem: Where did the adjacency list go? 
+• Solution: mapper emits (n, adjacency list) as well +78 +Mingshen Sun (CUHK) MapReduce & Hadoop +BFS Pseudo-Code +79 +Mingshen Sun (CUHK) MapReduce & Hadoop +Stopping Criterion +• How many iterations are needed in parallel BFS (equal edge +weight case)? +• Convince yourself: when a node is first “discovered”, we’ve +found the shortest path +• In practice, we iterate the algorithm until all node distances +are found (i.e., no more infinity) +• How? +• Maintain a counter inside the MapReduce program (i.e., count how +many node distances are found) +• Require a non-MapReduce driver program to submit a MapReduce +job to iterate the algorithm +• The driver program checks the counter value before submitting +another job +80 +Mingshen Sun (CUHK) MapReduce & Hadoop +Extend to General Weights +• Difference? +• How many iterations are needed in parallel BFS? +• How do we know that all shortest path distances are found? +81 +Mingshen Sun (CUHK) MapReduce & Hadoop +Other Graph Algorithms +• PageRank +• Subgraph pattern matching +• Computing simple graph statistics +• Degree vertex distributions +• Computing more complex graph statics +• Clustering coefficient +• Counting triangles +82 +Mingshen Sun (CUHK) MapReduce & Hadoop +Random Walks Over the Web +• Random surfer model: +• User starts at a random Web page +• User randomly clicks on links, surfing from page to page +• PageRank +• Characterizes the amount of time spent on any given page +• Mathematically, a probability distribution over pages +• PageRank captures notions of page importance +• Correspondence to human intuition? +• One of thousands of features used in web search (queryindependent) +83 +Mingshen Sun (CUHK) MapReduce & Hadoop +PageRank: Definition +• Given page x with inlinks t1…tn, where +• C(t) is the out-degree of t +• is probability of random jump +• N is the total number of nodes in the graph +84 +↵ +PR(x) = ↵ +✓ +1 +N +◆ ++ (1 \ No newline at end of file diff --git a/src/main/resources/cdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt.xml.xls b/src/main/resources/cdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..a58c00c287c619eed922f553c0dccdd2a00a4251 Binary files /dev/null and b/src/main/resources/cdtocode/doc/Hadoop MapReduce/mapreduce_hadoop2.txt.xml.xls differ diff --git a/src/main/resources/cdtocode/zbackup-Apache OODT File Manager.xls b/src/main/resources/cdtocode/zbackup-Apache OODT File Manager.xls new file mode 100644 index 0000000000000000000000000000000000000000..6a6f6160f576ae388ad7590c4c4f77b9dcfac6e7 Binary files /dev/null and b/src/main/resources/cdtocode/zbackup-Apache OODT File Manager.xls differ diff --git a/src/main/resources/cdtocode/zbackup-Hadoop HDFS.xls b/src/main/resources/cdtocode/zbackup-Hadoop HDFS.xls new file mode 100644 index 0000000000000000000000000000000000000000..a1a7baa99be8dfdea1f8c4fac937be4edf4d5c64 Binary files /dev/null and b/src/main/resources/cdtocode/zbackup-Hadoop HDFS.xls differ diff --git a/src/main/resources/cdtocode/zbackup-Hadoop MapReduce.xls b/src/main/resources/cdtocode/zbackup-Hadoop MapReduce.xls new file mode 100644 index 0000000000000000000000000000000000000000..a1a7baa99be8dfdea1f8c4fac937be4edf4d5c64 Binary files /dev/null and b/src/main/resources/cdtocode/zbackup-Hadoop MapReduce.xls differ diff --git a/src/test/java/com/hy/java/uct/cdtocode/CodeRelationMapperTest.java b/src/test/java/com/hy/java/uct/cdtocode/CodeRelationMapperTest.java index 
2c146e9ea10961e73fcf63ca0b06d02953c04cbb..bd5ad1f71249da2b95ea31fdce0a66c7051b52ae 100644 --- a/src/test/java/com/hy/java/uct/cdtocode/CodeRelationMapperTest.java +++ b/src/test/java/com/hy/java/uct/cdtocode/CodeRelationMapperTest.java @@ -2,6 +2,7 @@ package com.hy.java.uct.cdtocode; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -30,6 +31,13 @@ import com.hy.java.uct.util.UMLClass; import com.hy.java.utility.common.FileEditor; import com.hy.java.utility.common.Pair; +import jxl.Workbook; +import jxl.write.Label; +import jxl.write.WritableSheet; +import jxl.write.WritableWorkbook; +import jxl.write.WriteException; +import jxl.write.biff.RowsExceededException; + public class CodeRelationMapperTest { /** * 将要追踪的类图放在cd_dir目录下 @@ -46,6 +54,67 @@ public class CodeRelationMapperTest { */ private static final String test_data_dir = System.getProperty("user.dir") + "\\src\\test\\resources\\"; + /** + * 将追踪结果放在res_dir目录下 + * + * 类,追踪到的代码,追踪概率,参考关系,参考关系的目标 + */ + private static final String res_dir = System.getProperty("user.dir") + "\\src\\test\\resources\\cdtocode\\"; + + @Test + public void write() { + try { + // 工作簿 + WritableWorkbook workbook = Workbook.createWorkbook(new File(res_dir + "Apache OODT File Manager.xls")); + if (workbook != null) { + // 新建第一个工作表 + WritableSheet sheets = workbook.createSheet("Sheet1", 0); + // 构建工作表的表头 + Label label1 = new Label(0, 0, "类"); + sheets.addCell(label1); + Label label2 = new Label(1, 0, "追踪到的代码"); + sheets.addCell(label2); + Label label3 = new Label(2, 0, "追踪概率"); + sheets.addCell(label3); + Label label4 = new Label(3, 0, "参考关系类型"); + sheets.addCell(label4); + Label label5 = new Label(4, 0, "参考关系的目标"); + sheets.addCell(label5); + // 从第二行开始,写一下数据 + int rows = 5; + for (int row = 1; row < rows; row++) { + // 类 + Label _class = new Label(0, row, "Factory" + row); + sheets.addCell(_class); + // 追踪到的代码 + Label code = new Label(1, row, "Factory" + row + ".java"); + sheets.addCell(code); + // 追踪概率 + Label ratio = new Label(2, row, String.valueOf(Math.random())); + sheets.addCell(ratio); + // 参考关系类型 + Label ref_relation = new Label(3, row, "依赖"); + sheets.addCell(ref_relation); + // 参考关系的目标 + Label ref_relation_target = new Label(4, row, "adsofuasd fsaduofhasf asd,"); + sheets.addCell(ref_relation_target); + } + // 写入文件 + workbook.write(); + workbook.close(); + } + } catch (RowsExceededException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (WriteException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + @Test public void parseAttri() { FileEditor fe = new FileEditor(test_data_dir + "attri"); diff --git a/src/test/java/com/hy/java/uct/cdtocode/DocAnalyzerTest.java b/src/test/java/com/hy/java/uct/cdtocode/DocAnalyzerTest.java index fa3502905072462bc82fadb07c4552298b3f4624..a3299cb666849bb9dee9eb294477468c0d27b455 100644 --- a/src/test/java/com/hy/java/uct/cdtocode/DocAnalyzerTest.java +++ b/src/test/java/com/hy/java/uct/cdtocode/DocAnalyzerTest.java @@ -1,5 +1,7 @@ package com.hy.java.uct.cdtocode; +import java.io.File; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -17,6 +19,10 @@ import com.hy.java.uct.util.UMLClass; import edu.stanford.nlp.pipeline.CoreDocument; import edu.stanford.nlp.pipeline.CoreSentence; import 
edu.stanford.nlp.pipeline.StanfordCoreNLP; +import jxl.Sheet; +import jxl.Workbook; +import jxl.read.biff.BiffException; +import jxl.write.WritableWorkbook; public class DocAnalyzerTest { /** @@ -27,7 +33,38 @@ public class DocAnalyzerTest { /** * 将设计文档放在doc_dir目录下 */ - private static final String doc_dir = System.getProperty("user.dir") + "\\src\\main\\resources\\cdtocode\\doc\\"; + private static final String src_doc_dir = System.getProperty("user.dir") + "\\src\\main\\resources\\cdtocode\\doc\\"; + private static final String test_doc_dir = System.getProperty("user.dir") + "\\src\\test\\resources\\cdtocode\\doc\\"; + + @Test + public void jxl() { + try { + Workbook book = Workbook.getWorkbook(new File(src_doc_dir + "Apache OODT File Manager\\A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt.xml.xls")); + // 获得第一个工作表对象 + Sheet sheet = book.getSheet("Sheet0"); + // Sheet sheet = book.getSheet(0); + int rows = sheet.getRows(); + int cols = sheet.getColumns(); + System.out.println("总列数:" + cols); + System.out.println("总行数:" + rows); + System.out.println("----------------------------"); + int i = 0; + int j = 0; + // 循环读取数据 + for (i = 0; i < cols; i++) { + for (j = 0; j < rows; j++) { + System.out.println("第" + j + "行,第" + i + "列为:" + sheet.getCell(i, j).getContents()); + } + + } + } catch (BiffException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } @Test public void regexTest() { @@ -130,8 +167,8 @@ public class DocAnalyzerTest { */ List doc_ls = new ArrayList<>(); // 在这儿添加多个文件 - doc_ls.add(doc_dir + "basic-architecture.adoc"); - doc_ls.add(doc_dir + "Jetty10 Operations Guide _ The Eclipse Foundation.txt"); + doc_ls.add(test_doc_dir + "basic-architecture.adoc"); + doc_ls.add(test_doc_dir + "Jetty10 Operations Guide _ The Eclipse Foundation.txt"); Map> doc = DocReader.readDocs(doc_ls); /* * 3、分析文档信息。实际相当于增加类图中的UMLclass、类本身的内容、类之间关系 diff --git a/src/test/java/com/hy/java/uct/cdtocode/VSMAndLSIParserTest.java b/src/test/java/com/hy/java/uct/cdtocode/VSMAndLSIParserTest.java new file mode 100644 index 0000000000000000000000000000000000000000..b13d074b8854ea835ad85a4f6ffe6165555596d6 --- /dev/null +++ b/src/test/java/com/hy/java/uct/cdtocode/VSMAndLSIParserTest.java @@ -0,0 +1,42 @@ +package com.hy.java.uct.cdtocode; + +import java.io.File; +import java.io.IOException; + +import org.junit.jupiter.api.Test; + +import jxl.Sheet; +import jxl.Workbook; +import jxl.read.biff.BiffException; + +public class VSMAndLSIParserTest { + private final String vsm_dir = "D:\\eclipse-committers\\uml-code-trace\\src\\test\\resources\\cdtocode\\results-vsm.xls"; + private final String lsi_dir = "D:\\eclipse-committers\\uml-code-trace\\src\\test\\resources\\cdtocode\\results-lsi.xls"; + + @Test + public void parse() { + try { + // 工作簿 + Workbook book = Workbook.getWorkbook(new File(vsm_dir)); + // 获得第一个工作表对象 + Sheet sheet = book.getSheet("results-vsm"); + // Sheet sheet = book.getSheet(0); + int rows = sheet.getRows(); + int cols = sheet.getColumns(); + // 对每行数据,去除source和target中的空格。解析关系类型 + for (int row = 0; row < rows; row++) { + for (int column = 0; column < cols; column++) { + System.out.print(sheet.getCell(column, row).getContents() + "\t"); + } + System.out.println(); + } + book.close(); + } catch (BiffException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO 
Auto-generated catch block + e.printStackTrace(); + } + } +} diff --git a/src/test/resources/cdtocode/Apache OODT File Manager.xls b/src/test/resources/cdtocode/Apache OODT File Manager.xls new file mode 100644 index 0000000000000000000000000000000000000000..dcdc77b2b68dbbb0736383126f5548f851689eed Binary files /dev/null and b/src/test/resources/cdtocode/Apache OODT File Manager.xls differ diff --git a/src/test/resources/cdtocode/doc/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt.xml.xls b/src/test/resources/cdtocode/doc/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt.xml.xls new file mode 100644 index 0000000000000000000000000000000000000000..c8445c9256f74a448d604d7f7c91c56d49431289 Binary files /dev/null and b/src/test/resources/cdtocode/doc/A FRAMEWORK FOR COLLABORATIVE REVIEW OF CANDIDATE EVENTS IN HIGH DATA RATE STREAMS THE V-FASTR EXPERIMENT AS A CASE STUDY.txt.xml.xls differ diff --git a/src/test/resources/cdtocode/results-lsi.xls b/src/test/resources/cdtocode/results-lsi.xls new file mode 100644 index 0000000000000000000000000000000000000000..4a1062ae3e9bd61b318e8328ca43bcbe03fd6171 Binary files /dev/null and b/src/test/resources/cdtocode/results-lsi.xls differ diff --git a/src/test/resources/cdtocode/results-vsm.xls b/src/test/resources/cdtocode/results-vsm.xls new file mode 100644 index 0000000000000000000000000000000000000000..de1c317fda6ce388e81b09fbc2abdf9bebdd1174 Binary files /dev/null and b/src/test/resources/cdtocode/results-vsm.xls differ