From a3e3d6d0505b2e5cad0b4c6d61ccd04855a6f6c0 Mon Sep 17 00:00:00 2001 From: Henrik Date: Wed, 25 Jul 2018 14:38:29 +0200 Subject: Created parser that takes in ES-documents and ES-mapping and parses them into Vespa-documents --- config-model/src/main/python/ES_Vespa_parser.py | 226 ++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 config-model/src/main/python/ES_Vespa_parser.py diff --git a/config-model/src/main/python/ES_Vespa_parser.py b/config-model/src/main/python/ES_Vespa_parser.py new file mode 100644 index 00000000000..019eb463a79 --- /dev/null +++ b/config-model/src/main/python/ES_Vespa_parser.py @@ -0,0 +1,226 @@ +# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +import json +import argparse +import os, sys + + +# noinspection PyInterpreter +class ElasticSearchParser: + + document_file = None + mapping_file = None + search_definitions = {} + path = "" + _all = True + all_mappings = {} + no_index = [] + types = [] + + def __init__(self): + parser = argparse.ArgumentParser() + parser.add_argument("documents", help="location of file with documents to be parsed") + parser.add_argument("mappings", help="location of file with mappings") + args = parser.parse_args() + + self.document_file = args.documents + self.mapping_file = args.mappings + + def main(self): + self.path = os.getcwd() + "/documents_and_document_definitions/" + try: + os.mkdir(self.path, 0755); + print(" > Created folder '" + self.path + "'") + except OSError: + print(" > Folder '"+ self.path +"' already existed") + + + try: + os.mkdir(self.path + "searchdefinitions/", 0755); + print(" > Created folder '" + self.path + "searchdefinitions/" + "'") + except OSError: + print(" > Folder '"+ self.path + "searchdefinitions/" +"' already existed") + + self.parse() + self.createServices_xml() + self.createHosts_xml() + + + + def getMapping(self, type): + unparsed_mapping_file = open(self.mapping_file, "r") + type_mapping = {} + for line in unparsed_mapping_file: + data = json.loads(line) + index = data.keys()[0] + mappings = data[index]["mappings"][type]["properties"] + + #Checking if some fields could be no-index + try: + _all_enabled = data[index]["mappings"][type]["_all"]["enabled"] + if not _all_enabled: + self._all = False + print(" > All fields in the document type '" + type + "' is not searchable. Go inside "+self.path + type + ".sd to add which fields that should be searchable") + except KeyError: + print(" > All fields in the document type '"+type+"' is searchable") + + self.walk(mappings, type_mapping, "properties") + + unparsed_mapping_file.close() + if type not in self.search_definitions: + self.search_definitions[type] = True + self.types.append(type) + self.createSearchDefinition(type, type_mapping) + + # Adding mapping to global map with mappings + self.all_mappings[type] = type_mapping + return type_mapping + + def parse(self): + file_path = self.path + "documents" + ".json" + unparsed_document_file = open(self.document_file, "r") + vespa_docs = open(file_path, "w") + + for line in unparsed_document_file: + data = json.loads(line) + type = data["_type"] + + parsed_data = { + "put": "id:application_name:" + type + "::" + data["_id"], + "fields": {} + } + + # Checking for already existing mapping for a type, if not create a new + if type in self.all_mappings: + mapping = self.all_mappings[type] + else: + mapping = self.getMapping(type) + + for key, item in mapping.items(): + try: + parsed_data["fields"][key] = data["_source"][key] + except KeyError: + continue + + json.dump(parsed_data, vespa_docs) + vespa_docs.write("\n") + + vespa_docs.close() + unparsed_document_file.close() + print(" > Parsed all documents '" + ", ".join(self.types) + "'" + "' at '" + file_path + "'") + + def createSearchDefinition(self, type, type_mapping): + file_path = self.path + "searchdefinitions/" + type + ".sd" + new_sd = open(file_path, "w") + new_sd.write("search "+ type + " {\n") + new_sd.write(" document "+ type + " {\n") + + for key, item in type_mapping.items(): + new_sd.write(" field " + key + " type " + self.get_type(item) + " {\n") + new_sd.write(" indexing: " + self.get_indexing(key, self.get_type(item)) + "\n") + new_sd.write(" }\n") + + new_sd.write(" }\n") + new_sd.write("}\n") + new_sd.close() + print(" > Created search definition for '" + type + "' at '" + file_path + "'") + + def createServices_xml(self): + file_path = self.path + "services.xml" + new_services = open(file_path, "w") + template = ("" + "\n\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n\n" + " \n" + " 1\n" + " \n" + " 1.0\n" + " \n" + " \n") + + for i in range(0,len(self.types)): + template += " \n" + + + template += (" \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " 1\n" + " \n" + " \n" + " \n\n" + "") + + new_services.write(template) + new_services.close() + print(" > Created services.xml at '" + file_path + "'") + + + + def createHosts_xml(self): + file_path = self.path + "hosts.xml" + new_hosts = open(file_path, "w") + template = ("" + "" + " " + " node1" + " " + "") + + new_hosts.write(template) + new_hosts.close() + print(" > Created hosts.xml at '" + file_path + "'") + + + def get_type(self, type): + return { + "text": "string", + "keyword": "string", + "date": "string", + "long": "long", + "double": "double", + "boolean": "string", + "ip": "text", + "byte" : "byte", + "float" : "float", + + }[type] + + + def get_indexing(self, key, key_type): + if not self._all: + return "summary" + + if key not in self.no_index: + if key_type == "string": + return "summary | index" + else: + return "summary | attribute" + + return "summary" + + + def walk(self, node, mapping, parent): + for key, item in node.items(): + if isinstance(item, dict): + self.walk(item, mapping, key) + elif key == "type": + mapping[parent] = item + elif key == "include_in_all": + if not item: # Field should not be searchable + self.no_index.append(parent) + elif key == "index" and parent != "properties": + if item == "no": # Field should not be searchable + self.no_index.append(parent) + + +if __name__ == '__main__': + ElasticSearchParser().main() -- cgit v1.2.3 From f9407be3f00c58d5034465fe8305ba92811630ba Mon Sep 17 00:00:00 2001 From: Henrik Date: Wed, 25 Jul 2018 14:56:57 +0200 Subject: Added description, example of usage and author --- config-model/src/main/python/ES_Vespa_parser.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/config-model/src/main/python/ES_Vespa_parser.py b/config-model/src/main/python/ES_Vespa_parser.py index 019eb463a79..91b576d5788 100644 --- a/config-model/src/main/python/ES_Vespa_parser.py +++ b/config-model/src/main/python/ES_Vespa_parser.py @@ -4,7 +4,11 @@ import argparse import os, sys -# noinspection PyInterpreter +# Parsing Elastic Search documents to Vespa documents +# Example of usage: python ES_Vespa_parser.py my_index.json my_index_mapping.json +__author__ = 'henrhoi' + + class ElasticSearchParser: document_file = None -- cgit v1.2.3 From 4a33b491a005ba6ed6f66a5c0a361bd7e67e3c72 Mon Sep 17 00:00:00 2001 From: Henrik Date: Thu, 26 Jul 2018 15:20:28 +0200 Subject: Added arg --application_name, now creating folder for searchdefs ++ --- config-model/src/main/python/ES_Vespa_parser.py | 118 +++++++++++------------- 1 file changed, 55 insertions(+), 63 deletions(-) diff --git a/config-model/src/main/python/ES_Vespa_parser.py b/config-model/src/main/python/ES_Vespa_parser.py index 91b576d5788..477b0db4744 100644 --- a/config-model/src/main/python/ES_Vespa_parser.py +++ b/config-model/src/main/python/ES_Vespa_parser.py @@ -3,16 +3,15 @@ import json import argparse import os, sys - # Parsing Elastic Search documents to Vespa documents # Example of usage: python ES_Vespa_parser.py my_index.json my_index_mapping.json __author__ = 'henrhoi' class ElasticSearchParser: - document_file = None mapping_file = None + application_name = None search_definitions = {} path = "" _all = True @@ -22,50 +21,49 @@ class ElasticSearchParser: def __init__(self): parser = argparse.ArgumentParser() - parser.add_argument("documents", help="location of file with documents to be parsed") - parser.add_argument("mappings", help="location of file with mappings") + parser.add_argument("documents_path", help="location of file with documents to be parsed", type=str) + parser.add_argument("mappings_path", help="location of file with mappings", type=str) + parser.add_argument("--application_name", help="name of application", default="application_name", type=str) args = parser.parse_args() - self.document_file = args.documents - self.mapping_file = args.mappings + self.document_file = args.documents_path + self.mapping_file = args.mappings_path + self.application_name = args.application_name def main(self): - self.path = os.getcwd() + "/documents_and_document_definitions/" + self.path = os.getcwd() + "/application/" try: - os.mkdir(self.path, 0755); + os.mkdir(self.path, 0o777) print(" > Created folder '" + self.path + "'") except OSError: - print(" > Folder '"+ self.path +"' already existed") - + print(" > Folder '" + self.path + "' already existed") try: - os.mkdir(self.path + "searchdefinitions/", 0755); + os.makedirs(self.path + "searchdefinitions/", 0o777) print(" > Created folder '" + self.path + "searchdefinitions/" + "'") except OSError: - print(" > Folder '"+ self.path + "searchdefinitions/" +"' already existed") + print(" > Folder '" + self.path + "searchdefinitions/" + "' already existed") self.parse() self.createServices_xml() self.createHosts_xml() - - def getMapping(self, type): unparsed_mapping_file = open(self.mapping_file, "r") type_mapping = {} for line in unparsed_mapping_file: data = json.loads(line) - index = data.keys()[0] + index = list(data.keys())[0] mappings = data[index]["mappings"][type]["properties"] - #Checking if some fields could be no-index + # Checking if some fields could be no-index try: _all_enabled = data[index]["mappings"][type]["_all"]["enabled"] if not _all_enabled: self._all = False - print(" > All fields in the document type '" + type + "' is not searchable. Go inside "+self.path + type + ".sd to add which fields that should be searchable") + print(" > All fields in the document type '" + type + "' is not searchable. Go inside " + self.path + type + ".sd to add which fields that should be searchable") except KeyError: - print(" > All fields in the document type '"+type+"' is searchable") + print(" > All fields in the document type '" + type + "' is searchable") self.walk(mappings, type_mapping, "properties") @@ -89,7 +87,7 @@ class ElasticSearchParser: type = data["_type"] parsed_data = { - "put": "id:application_name:" + type + "::" + data["_id"], + "put": "id:"+self.application_name+":" + type + "::" + data["_id"], "fields": {} } @@ -115,8 +113,8 @@ class ElasticSearchParser: def createSearchDefinition(self, type, type_mapping): file_path = self.path + "searchdefinitions/" + type + ".sd" new_sd = open(file_path, "w") - new_sd.write("search "+ type + " {\n") - new_sd.write(" document "+ type + " {\n") + new_sd.write("search " + type + " {\n") + new_sd.write(" document " + type + " {\n") for key, item in type_mapping.items(): new_sd.write(" field " + key + " type " + self.get_type(item) + " {\n") @@ -132,58 +130,54 @@ class ElasticSearchParser: file_path = self.path + "services.xml" new_services = open(file_path, "w") template = ("" - "\n\n" - " \n" - " \n" - " \n" - " \n" - " \n" - " \n" - " \n\n" - " \n" - " 1\n" - " \n" - " 1.0\n" - " \n" - " \n") - - for i in range(0,len(self.types)): - template += " \n" - + "\n\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n\n" + " \n" + " 1\n" + " \n" + " 1.0\n" + " \n" + " \n") + + for i in range(0, len(self.types)): + template += " \n" template += (" \n" - " \n" - " \n" - " \n" - " \n" - " \n" - " 1\n" - " \n" - " \n" - " \n\n" - "") + " \n" + " \n" + " \n" + " \n" + " \n" + " 1\n" + " \n" + " \n" + " \n\n" + "") new_services.write(template) new_services.close() print(" > Created services.xml at '" + file_path + "'") - - def createHosts_xml(self): file_path = self.path + "hosts.xml" new_hosts = open(file_path, "w") - template = ("" - "" - " " - " node1" - " " + template = ("\n" + "\n" + " \n" + " node1\n" + " \n" "") new_hosts.write(template) new_hosts.close() print(" > Created hosts.xml at '" + file_path + "'") - def get_type(self, type): return { "text": "string", @@ -193,12 +187,11 @@ class ElasticSearchParser: "double": "double", "boolean": "string", "ip": "text", - "byte" : "byte", - "float" : "float", + "byte": "byte", + "float": "float", }[type] - def get_indexing(self, key, key_type): if not self._all: return "summary" @@ -211,7 +204,6 @@ class ElasticSearchParser: return "summary" - def walk(self, node, mapping, parent): for key, item in node.items(): if isinstance(item, dict): @@ -219,10 +211,10 @@ class ElasticSearchParser: elif key == "type": mapping[parent] = item elif key == "include_in_all": - if not item: # Field should not be searchable + if not item: # Field should not be searchable self.no_index.append(parent) elif key == "index" and parent != "properties": - if item == "no": # Field should not be searchable + if item == "no": # Field should not be searchable self.no_index.append(parent) -- cgit v1.2.3