From a3e3d6d0505b2e5cad0b4c6d61ccd04855a6f6c0 Mon Sep 17 00:00:00 2001
From: Henrik <henrik.hoiness@online.no>
Date: Wed, 25 Jul 2018 14:38:29 +0200
Subject: Created parser that takes in ES-documents and ES-mapping and parses
 them into Vespa-documents

---
 config-model/src/main/python/ES_Vespa_parser.py | 226 ++++++++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 config-model/src/main/python/ES_Vespa_parser.py

diff --git a/config-model/src/main/python/ES_Vespa_parser.py b/config-model/src/main/python/ES_Vespa_parser.py
new file mode 100644
index 00000000000..019eb463a79
--- /dev/null
+++ b/config-model/src/main/python/ES_Vespa_parser.py
@@ -0,0 +1,226 @@
+# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+import json
+import argparse
+import os, sys
+
+
+# noinspection PyInterpreter
+class ElasticSearchParser:
+
+    document_file = None
+    mapping_file = None
+    search_definitions = {}
+    path = ""
+    _all = True
+    all_mappings = {}
+    no_index = []
+    types = []
+
+    def __init__(self):
+        parser = argparse.ArgumentParser()
+        parser.add_argument("documents", help="location of file with documents to be parsed")
+        parser.add_argument("mappings", help="location of file with mappings")
+        args = parser.parse_args()
+
+        self.document_file = args.documents
+        self.mapping_file = args.mappings
+
+    def main(self):
+        self.path = os.getcwd() + "/documents_and_document_definitions/"
+        try:
+            os.mkdir(self.path, 0755);
+            print(" > Created folder '" + self.path + "'")
+        except OSError:
+            print(" > Folder '"+ self.path +"' already existed")
+
+
+        try:
+            os.mkdir(self.path + "searchdefinitions/", 0755);
+            print(" > Created folder '" + self.path + "searchdefinitions/" + "'")
+        except OSError:
+            print(" > Folder '"+ self.path + "searchdefinitions/" +"' already existed")
+
+        self.parse()
+        self.createServices_xml()
+        self.createHosts_xml()
+
+
+
+    def getMapping(self, type):
+        unparsed_mapping_file = open(self.mapping_file, "r")
+        type_mapping = {}
+        for line in unparsed_mapping_file:
+            data = json.loads(line)
+            index = data.keys()[0]
+            mappings = data[index]["mappings"][type]["properties"]
+
+            #Checking if some fields could be no-index
+            try:
+                _all_enabled = data[index]["mappings"][type]["_all"]["enabled"]
+                if not _all_enabled:
+                    self._all = False
+                    print(" > All fields in the document type '" + type + "' is not searchable. Go inside "+self.path + type + ".sd to add which fields that should be searchable")
+            except KeyError:
+                print(" > All fields in the document type '"+type+"' is searchable")
+
+            self.walk(mappings, type_mapping, "properties")
+
+        unparsed_mapping_file.close()
+        if type not in self.search_definitions:
+            self.search_definitions[type] = True
+            self.types.append(type)
+            self.createSearchDefinition(type, type_mapping)
+
+        # Adding mapping to global map with mappings
+        self.all_mappings[type] = type_mapping
+        return type_mapping
+
+    def parse(self):
+        file_path = self.path + "documents" + ".json"
+        unparsed_document_file = open(self.document_file, "r")
+        vespa_docs = open(file_path, "w")
+
+        for line in unparsed_document_file:
+            data = json.loads(line)
+            type = data["_type"]
+
+            parsed_data = {
+                "put": "id:application_name:" + type + "::" + data["_id"],
+                "fields": {}
+            }
+
+            # Checking for already existing mapping for a type, if not create a new
+            if type in self.all_mappings:
+                mapping = self.all_mappings[type]
+            else:
+                mapping = self.getMapping(type)
+
+            for key, item in mapping.items():
+                try:
+                    parsed_data["fields"][key] = data["_source"][key]
+                except KeyError:
+                    continue
+
+            json.dump(parsed_data, vespa_docs)
+            vespa_docs.write("\n")
+
+        vespa_docs.close()
+        unparsed_document_file.close()
+        print(" > Parsed all documents '" + ", ".join(self.types) + "'" + "' at '" + file_path + "'")
+
+    def createSearchDefinition(self, type, type_mapping):
+        file_path = self.path + "searchdefinitions/" + type + ".sd"
+        new_sd = open(file_path, "w")
+        new_sd.write("search "+ type + " {\n")
+        new_sd.write("    document "+ type + " {\n")
+
+        for key, item in type_mapping.items():
+            new_sd.write("        field " + key + " type " + self.get_type(item) + " {\n")
+            new_sd.write("            indexing: " + self.get_indexing(key, self.get_type(item)) + "\n")
+            new_sd.write("        }\n")
+
+        new_sd.write("    }\n")
+        new_sd.write("}\n")
+        new_sd.close()
+        print(" > Created search definition for '" + type + "' at '" + file_path + "'")
+
+    def createServices_xml(self):
+        file_path = self.path + "services.xml"
+        new_services = open(file_path, "w")
+        template = ("<?xml version='1.0' encoding='UTF-8'?>"
+        "<services version='1.0'>\n\n"
+        "  <container id='default' version='1.0'>\n"
+        "    <search/>\n"
+        "    <document-api/>\n"
+        "    <nodes>\n"
+        "      <node hostalias='node1'/>\n"
+        "    </nodes>\n"
+        "  </container>\n\n"
+        "  <content id='content' version='1.0'>\n"
+        "    <redundancy>1</redundancy>\n"
+        "    <search>\n"
+        "      <visibility-delay>1.0</visibility-delay>\n"
+        "    </search>\n"
+        "    <documents>\n")
+
+        for i in range(0,len(self.types)):
+            template += "      <document mode='index' type='"+self.types[i]+"'/>\n"
+
+
+        template += ("    </documents>\n"
+        "    <nodes>\n"
+        "      <node hostalias='node1' distribution-key=\"0\"/>\n"
+        "    </nodes>\n"
+        "    <engine>\n"
+        "      <proton>\n"
+        "        <searchable-copies>1</searchable-copies>\n"
+        "      </proton>\n"
+        "    </engine>\n"
+        "  </content>\n\n"
+        "</services>")
+
+        new_services.write(template)
+        new_services.close()
+        print(" > Created services.xml at '" + file_path + "'")
+
+
+
+    def createHosts_xml(self):
+        file_path = self.path + "hosts.xml"
+        new_hosts = open(file_path, "w")
+        template = ("<?xml version=\"1.0\" encoding=\"utf-8\" ?>"
+                    "<hosts>"
+                    "  <host name=\"localhost\">"
+                    "    <alias>node1</alias>"
+                    "  </host>" 
+                    "</hosts>")
+
+        new_hosts.write(template)
+        new_hosts.close()
+        print(" > Created hosts.xml at '" + file_path + "'")
+
+
+    def get_type(self, type):
+        return {
+            "text": "string",
+            "keyword": "string",
+            "date": "string",
+            "long": "long",
+            "double": "double",
+            "boolean": "string",
+            "ip": "text",
+            "byte" : "byte",
+            "float" : "float",
+
+        }[type]
+
+
+    def get_indexing(self, key, key_type):
+        if not self._all:
+            return "summary"
+
+        if key not in self.no_index:
+            if key_type == "string":
+                return "summary | index"
+            else:
+                return "summary | attribute"
+
+        return "summary"
+
+
+    def walk(self, node, mapping, parent):
+        for key, item in node.items():
+            if isinstance(item, dict):
+                self.walk(item, mapping, key)
+            elif key == "type":
+                mapping[parent] = item
+            elif key == "include_in_all":
+                if not item:                    # Field should not be searchable
+                    self.no_index.append(parent)
+            elif key == "index" and parent != "properties":
+                if item == "no":                 # Field should not be searchable
+                    self.no_index.append(parent)
+
+
+if __name__ == '__main__':
+    ElasticSearchParser().main()
-- 
cgit v1.2.3


From f9407be3f00c58d5034465fe8305ba92811630ba Mon Sep 17 00:00:00 2001
From: Henrik <henrik.hoiness@online.no>
Date: Wed, 25 Jul 2018 14:56:57 +0200
Subject: Added description, example of usage and author

---
 config-model/src/main/python/ES_Vespa_parser.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/config-model/src/main/python/ES_Vespa_parser.py b/config-model/src/main/python/ES_Vespa_parser.py
index 019eb463a79..91b576d5788 100644
--- a/config-model/src/main/python/ES_Vespa_parser.py
+++ b/config-model/src/main/python/ES_Vespa_parser.py
@@ -4,7 +4,11 @@ import argparse
 import os, sys
 
 
-# noinspection PyInterpreter
+# Parsing Elastic Search documents to Vespa documents
+# Example of usage:  python ES_Vespa_parser.py my_index.json my_index_mapping.json
+__author__ = 'henrhoi'
+
+
 class ElasticSearchParser:
 
     document_file = None
-- 
cgit v1.2.3


From 4a33b491a005ba6ed6f66a5c0a361bd7e67e3c72 Mon Sep 17 00:00:00 2001
From: Henrik <henrik.hoiness@online.no>
Date: Thu, 26 Jul 2018 15:20:28 +0200
Subject: Added arg --application_name, now creating folder for searchdefs ++

---
 config-model/src/main/python/ES_Vespa_parser.py | 118 +++++++++++-------------
 1 file changed, 55 insertions(+), 63 deletions(-)

diff --git a/config-model/src/main/python/ES_Vespa_parser.py b/config-model/src/main/python/ES_Vespa_parser.py
index 91b576d5788..477b0db4744 100644
--- a/config-model/src/main/python/ES_Vespa_parser.py
+++ b/config-model/src/main/python/ES_Vespa_parser.py
@@ -3,16 +3,15 @@ import json
 import argparse
 import os, sys
 
-
 # Parsing Elastic Search documents to Vespa documents
 # Example of usage:  python ES_Vespa_parser.py my_index.json my_index_mapping.json
 __author__ = 'henrhoi'
 
 
 class ElasticSearchParser:
-
     document_file = None
     mapping_file = None
+    application_name = None
     search_definitions = {}
     path = ""
     _all = True
@@ -22,50 +21,49 @@ class ElasticSearchParser:
 
     def __init__(self):
         parser = argparse.ArgumentParser()
-        parser.add_argument("documents", help="location of file with documents to be parsed")
-        parser.add_argument("mappings", help="location of file with mappings")
+        parser.add_argument("documents_path", help="location of file with documents to be parsed", type=str)
+        parser.add_argument("mappings_path", help="location of file with mappings", type=str)
+        parser.add_argument("--application_name", help="name of application", default="application_name", type=str)
         args = parser.parse_args()
 
-        self.document_file = args.documents
-        self.mapping_file = args.mappings
+        self.document_file = args.documents_path
+        self.mapping_file = args.mappings_path
+        self.application_name = args.application_name
 
     def main(self):
-        self.path = os.getcwd() + "/documents_and_document_definitions/"
+        self.path = os.getcwd() + "/application/"
         try:
-            os.mkdir(self.path, 0755);
+            os.mkdir(self.path, 0o777)
             print(" > Created folder '" + self.path + "'")
         except OSError:
-            print(" > Folder '"+ self.path +"' already existed")
-
+            print(" > Folder '" + self.path + "' already existed")
 
         try:
-            os.mkdir(self.path + "searchdefinitions/", 0755);
+            os.makedirs(self.path + "searchdefinitions/", 0o777)
             print(" > Created folder '" + self.path + "searchdefinitions/" + "'")
         except OSError:
-            print(" > Folder '"+ self.path + "searchdefinitions/" +"' already existed")
+            print(" > Folder '" + self.path + "searchdefinitions/" + "' already existed")
 
         self.parse()
         self.createServices_xml()
         self.createHosts_xml()
 
-
-
     def getMapping(self, type):
         unparsed_mapping_file = open(self.mapping_file, "r")
         type_mapping = {}
         for line in unparsed_mapping_file:
             data = json.loads(line)
-            index = data.keys()[0]
+            index = list(data.keys())[0]
             mappings = data[index]["mappings"][type]["properties"]
 
-            #Checking if some fields could be no-index
+            # Checking if some fields could be no-index
             try:
                 _all_enabled = data[index]["mappings"][type]["_all"]["enabled"]
                 if not _all_enabled:
                     self._all = False
-                    print(" > All fields in the document type '" + type + "' is not searchable. Go inside "+self.path + type + ".sd to add which fields that should be searchable")
+                    print(" > All fields in the document type '" + type + "' is not searchable. Go inside " + self.path + type + ".sd to add which fields that should be searchable")
             except KeyError:
-                print(" > All fields in the document type '"+type+"' is searchable")
+                print(" > All fields in the document type '" + type + "' is searchable")
 
             self.walk(mappings, type_mapping, "properties")
 
@@ -89,7 +87,7 @@ class ElasticSearchParser:
             type = data["_type"]
 
             parsed_data = {
-                "put": "id:application_name:" + type + "::" + data["_id"],
+                "put": "id:"+self.application_name+":" + type + "::" + data["_id"],
                 "fields": {}
             }
 
@@ -115,8 +113,8 @@ class ElasticSearchParser:
     def createSearchDefinition(self, type, type_mapping):
         file_path = self.path + "searchdefinitions/" + type + ".sd"
         new_sd = open(file_path, "w")
-        new_sd.write("search "+ type + " {\n")
-        new_sd.write("    document "+ type + " {\n")
+        new_sd.write("search " + type + " {\n")
+        new_sd.write("    document " + type + " {\n")
 
         for key, item in type_mapping.items():
             new_sd.write("        field " + key + " type " + self.get_type(item) + " {\n")
@@ -132,58 +130,54 @@ class ElasticSearchParser:
         file_path = self.path + "services.xml"
         new_services = open(file_path, "w")
         template = ("<?xml version='1.0' encoding='UTF-8'?>"
-        "<services version='1.0'>\n\n"
-        "  <container id='default' version='1.0'>\n"
-        "    <search/>\n"
-        "    <document-api/>\n"
-        "    <nodes>\n"
-        "      <node hostalias='node1'/>\n"
-        "    </nodes>\n"
-        "  </container>\n\n"
-        "  <content id='content' version='1.0'>\n"
-        "    <redundancy>1</redundancy>\n"
-        "    <search>\n"
-        "      <visibility-delay>1.0</visibility-delay>\n"
-        "    </search>\n"
-        "    <documents>\n")
-
-        for i in range(0,len(self.types)):
-            template += "      <document mode='index' type='"+self.types[i]+"'/>\n"
-
+                    "<services version='1.0'>\n\n"
+                    "  <container id='default' version='1.0'>\n"
+                    "    <search/>\n"
+                    "    <document-api/>\n"
+                    "    <nodes>\n"
+                    "      <node hostalias='node1'/>\n"
+                    "    </nodes>\n"
+                    "  </container>\n\n"
+                    "  <content id='content' version='1.0'>\n"
+                    "    <redundancy>1</redundancy>\n"
+                    "    <search>\n"
+                    "      <visibility-delay>1.0</visibility-delay>\n"
+                    "    </search>\n"
+                    "    <documents>\n")
+
+        for i in range(0, len(self.types)):
+            template += "      <document mode='index' type='" + self.types[i] + "'/>\n"
 
         template += ("    </documents>\n"
-        "    <nodes>\n"
-        "      <node hostalias='node1' distribution-key=\"0\"/>\n"
-        "    </nodes>\n"
-        "    <engine>\n"
-        "      <proton>\n"
-        "        <searchable-copies>1</searchable-copies>\n"
-        "      </proton>\n"
-        "    </engine>\n"
-        "  </content>\n\n"
-        "</services>")
+                     "    <nodes>\n"
+                     "      <node hostalias='node1' distribution-key=\"0\"/>\n"
+                     "    </nodes>\n"
+                     "    <engine>\n"
+                     "      <proton>\n"
+                     "        <searchable-copies>1</searchable-copies>\n"
+                     "      </proton>\n"
+                     "    </engine>\n"
+                     "  </content>\n\n"
+                     "</services>")
 
         new_services.write(template)
         new_services.close()
         print(" > Created services.xml at '" + file_path + "'")
 
-
-
     def createHosts_xml(self):
         file_path = self.path + "hosts.xml"
         new_hosts = open(file_path, "w")
-        template = ("<?xml version=\"1.0\" encoding=\"utf-8\" ?>"
-                    "<hosts>"
-                    "  <host name=\"localhost\">"
-                    "    <alias>node1</alias>"
-                    "  </host>" 
+        template = ("<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n"
+                    "<hosts>\n"
+                    "  <host name=\"localhost\">\n"
+                    "    <alias>node1</alias>\n"
+                    "  </host>\n"
                     "</hosts>")
 
         new_hosts.write(template)
         new_hosts.close()
         print(" > Created hosts.xml at '" + file_path + "'")
 
-
     def get_type(self, type):
         return {
             "text": "string",
@@ -193,12 +187,11 @@ class ElasticSearchParser:
             "double": "double",
             "boolean": "string",
             "ip": "text",
-            "byte" : "byte",
-            "float" : "float",
+            "byte": "byte",
+            "float": "float",
 
         }[type]
 
-
     def get_indexing(self, key, key_type):
         if not self._all:
             return "summary"
@@ -211,7 +204,6 @@ class ElasticSearchParser:
 
         return "summary"
 
-
     def walk(self, node, mapping, parent):
         for key, item in node.items():
             if isinstance(item, dict):
@@ -219,10 +211,10 @@ class ElasticSearchParser:
             elif key == "type":
                 mapping[parent] = item
             elif key == "include_in_all":
-                if not item:                    # Field should not be searchable
+                if not item:  # Field should not be searchable
                     self.no_index.append(parent)
             elif key == "index" and parent != "properties":
-                if item == "no":                 # Field should not be searchable
+                if item == "no":  # Field should not be searchable
                     self.no_index.append(parent)
 
 
-- 
cgit v1.2.3