diff options
author | Henrik Høiness <31851923+henrhoi@users.noreply.github.com> | 2018-07-30 09:18:54 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-07-30 09:18:54 +0200 |
commit | 0a5e370b59373d8147e2a8e682bc3296eec0d639 (patch) | |
tree | 92d4eab51b3894efc24fe0ac24131e4ad79dda5f | |
parent | 7e6b0646c8567dfdd7b89295f9c288dce01f8d6f (diff) | |
parent | 4a33b491a005ba6ed6f66a5c0a361bd7e67e3c72 (diff) |
Merge pull request #6467 from vespa-engine/henrhoi/es_vespa_parser
henrhoi/es_vespa_parser
-rw-r--r-- | config-model/src/main/python/ES_Vespa_parser.py | 222 |
1 files changed, 222 insertions, 0 deletions
diff --git a/config-model/src/main/python/ES_Vespa_parser.py b/config-model/src/main/python/ES_Vespa_parser.py new file mode 100644 index 00000000000..477b0db4744 --- /dev/null +++ b/config-model/src/main/python/ES_Vespa_parser.py @@ -0,0 +1,222 @@ +# Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +import json +import argparse +import os, sys + +# Parsing Elastic Search documents to Vespa documents +# Example of usage: python ES_Vespa_parser.py my_index.json my_index_mapping.json +__author__ = 'henrhoi' + + +class ElasticSearchParser: + document_file = None + mapping_file = None + application_name = None + search_definitions = {} + path = "" + _all = True + all_mappings = {} + no_index = [] + types = [] + + def __init__(self): + parser = argparse.ArgumentParser() + parser.add_argument("documents_path", help="location of file with documents to be parsed", type=str) + parser.add_argument("mappings_path", help="location of file with mappings", type=str) + parser.add_argument("--application_name", help="name of application", default="application_name", type=str) + args = parser.parse_args() + + self.document_file = args.documents_path + self.mapping_file = args.mappings_path + self.application_name = args.application_name + + def main(self): + self.path = os.getcwd() + "/application/" + try: + os.mkdir(self.path, 0o777) + print(" > Created folder '" + self.path + "'") + except OSError: + print(" > Folder '" + self.path + "' already existed") + + try: + os.makedirs(self.path + "searchdefinitions/", 0o777) + print(" > Created folder '" + self.path + "searchdefinitions/" + "'") + except OSError: + print(" > Folder '" + self.path + "searchdefinitions/" + "' already existed") + + self.parse() + self.createServices_xml() + self.createHosts_xml() + + def getMapping(self, type): + unparsed_mapping_file = open(self.mapping_file, "r") + type_mapping = {} + for line in unparsed_mapping_file: + data = json.loads(line) + index = list(data.keys())[0] + mappings = data[index]["mappings"][type]["properties"] + + # Checking if some fields could be no-index + try: + _all_enabled = data[index]["mappings"][type]["_all"]["enabled"] + if not _all_enabled: + self._all = False + print(" > All fields in the document type '" + type + "' is not searchable. Go inside " + self.path + type + ".sd to add which fields that should be searchable") + except KeyError: + print(" > All fields in the document type '" + type + "' is searchable") + + self.walk(mappings, type_mapping, "properties") + + unparsed_mapping_file.close() + if type not in self.search_definitions: + self.search_definitions[type] = True + self.types.append(type) + self.createSearchDefinition(type, type_mapping) + + # Adding mapping to global map with mappings + self.all_mappings[type] = type_mapping + return type_mapping + + def parse(self): + file_path = self.path + "documents" + ".json" + unparsed_document_file = open(self.document_file, "r") + vespa_docs = open(file_path, "w") + + for line in unparsed_document_file: + data = json.loads(line) + type = data["_type"] + + parsed_data = { + "put": "id:"+self.application_name+":" + type + "::" + data["_id"], + "fields": {} + } + + # Checking for already existing mapping for a type, if not create a new + if type in self.all_mappings: + mapping = self.all_mappings[type] + else: + mapping = self.getMapping(type) + + for key, item in mapping.items(): + try: + parsed_data["fields"][key] = data["_source"][key] + except KeyError: + continue + + json.dump(parsed_data, vespa_docs) + vespa_docs.write("\n") + + vespa_docs.close() + unparsed_document_file.close() + print(" > Parsed all documents '" + ", ".join(self.types) + "'" + "' at '" + file_path + "'") + + def createSearchDefinition(self, type, type_mapping): + file_path = self.path + "searchdefinitions/" + type + ".sd" + new_sd = open(file_path, "w") + new_sd.write("search " + type + " {\n") + new_sd.write(" document " + type + " {\n") + + for key, item in type_mapping.items(): + new_sd.write(" field " + key + " type " + self.get_type(item) + " {\n") + new_sd.write(" indexing: " + self.get_indexing(key, self.get_type(item)) + "\n") + new_sd.write(" }\n") + + new_sd.write(" }\n") + new_sd.write("}\n") + new_sd.close() + print(" > Created search definition for '" + type + "' at '" + file_path + "'") + + def createServices_xml(self): + file_path = self.path + "services.xml" + new_services = open(file_path, "w") + template = ("<?xml version='1.0' encoding='UTF-8'?>" + "<services version='1.0'>\n\n" + " <container id='default' version='1.0'>\n" + " <search/>\n" + " <document-api/>\n" + " <nodes>\n" + " <node hostalias='node1'/>\n" + " </nodes>\n" + " </container>\n\n" + " <content id='content' version='1.0'>\n" + " <redundancy>1</redundancy>\n" + " <search>\n" + " <visibility-delay>1.0</visibility-delay>\n" + " </search>\n" + " <documents>\n") + + for i in range(0, len(self.types)): + template += " <document mode='index' type='" + self.types[i] + "'/>\n" + + template += (" </documents>\n" + " <nodes>\n" + " <node hostalias='node1' distribution-key=\"0\"/>\n" + " </nodes>\n" + " <engine>\n" + " <proton>\n" + " <searchable-copies>1</searchable-copies>\n" + " </proton>\n" + " </engine>\n" + " </content>\n\n" + "</services>") + + new_services.write(template) + new_services.close() + print(" > Created services.xml at '" + file_path + "'") + + def createHosts_xml(self): + file_path = self.path + "hosts.xml" + new_hosts = open(file_path, "w") + template = ("<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n" + "<hosts>\n" + " <host name=\"localhost\">\n" + " <alias>node1</alias>\n" + " </host>\n" + "</hosts>") + + new_hosts.write(template) + new_hosts.close() + print(" > Created hosts.xml at '" + file_path + "'") + + def get_type(self, type): + return { + "text": "string", + "keyword": "string", + "date": "string", + "long": "long", + "double": "double", + "boolean": "string", + "ip": "text", + "byte": "byte", + "float": "float", + + }[type] + + def get_indexing(self, key, key_type): + if not self._all: + return "summary" + + if key not in self.no_index: + if key_type == "string": + return "summary | index" + else: + return "summary | attribute" + + return "summary" + + def walk(self, node, mapping, parent): + for key, item in node.items(): + if isinstance(item, dict): + self.walk(item, mapping, key) + elif key == "type": + mapping[parent] = item + elif key == "include_in_all": + if not item: # Field should not be searchable + self.no_index.append(parent) + elif key == "index" and parent != "properties": + if item == "no": # Field should not be searchable + self.no_index.append(parent) + + +if __name__ == '__main__': + ElasticSearchParser().main() |