DeepsetDocumentMetadataPreProcessor

Process and transform metadata.

components:
  file_classifier:
    type: haystack.components.routers.file_type_router.FileTypeRouter
    init_parameters:
      mime_types:
      - text/plain
      - application/pdf
      - text/markdown
      - text/html
      - application/vnd.openxmlformats-officedocument.wordprocessingml.document
      - application/vnd.openxmlformats-officedocument.presentationml.presentation
      - application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
      - text/csv

  text_converter:
    type: haystack.components.converters.txt.TextFileToDocument
    init_parameters:
      encoding: utf-8

  pdf_converter:
    type: haystack.components.converters.pdfminer.PDFMinerToDocument
    init_parameters:
      line_overlap: 0.5
      char_margin: 2
      line_margin: 0.5
      word_margin: 0.1
      boxes_flow: 0.5
      detect_vertical: true
      all_texts: false
      store_full_path: false

  markdown_converter:
    type: haystack.components.converters.txt.TextFileToDocument
    init_parameters:
      encoding: utf-8

  html_converter:
    type: haystack.components.converters.html.HTMLToDocument
    init_parameters:
      extraction_kwargs:
        output_format: markdown
        target_language:
        include_tables: true
        include_links: true

  docx_converter:
    type: haystack.components.converters.docx.DOCXToDocument
    init_parameters:
      link_format: markdown

  pptx_converter:
    type: haystack.components.converters.pptx.PPTXToDocument
    init_parameters: {}

  xlsx_converter:
    type: haystack.components.converters.xlsx.XLSXToDocument
    init_parameters: {}

  csv_converter:
    type: haystack.components.converters.csv.CSVToDocument
    init_parameters:
      encoding: utf-8

  joiner:
    type: haystack.components.joiners.document_joiner.DocumentJoiner
    init_parameters:
      join_mode: concatenate
      sort_by_score: false

  joiner_xlsx:
    type: haystack.components.joiners.document_joiner.DocumentJoiner
    init_parameters:
      join_mode: concatenate
      sort_by_score: false

  splitter:
    type: haystack.components.preprocessors.document_splitter.DocumentSplitter
    init_parameters:
      split_by: word
      split_length: 250
      split_overlap: 30
      respect_sentence_boundary: true
      language: en

  document_embedder:
    type: deepset_cloud_custom_nodes.embedders.nvidia.document_embedder.DeepsetNvidiaDocumentEmbedder
    init_parameters:
      normalize_embeddings: true
      model: intfloat/e5-base-v2

  writer:
    type: haystack.components.writers.document_writer.DocumentWriter
    init_parameters:
      document_store:
        type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
        init_parameters:
          hosts:
          index: ''
          max_chunk_bytes: 104857600
          embedding_dim: 768
          return_embedding: false
          method:
          mappings:
          settings:
          create_index: true
          http_auth:
          use_ssl:
          verify_certs:
          timeout:
      policy: OVERWRITE

  DeepsetDocumentMetadataPreProcessor:
    type: deepset_cloud_custom_nodes.preprocessors.document_metadata_preprocessor.DeepsetDocumentMetadataPreProcessor
    init_parameters:
      replace_fields: "\\n"
      convert_meta_to_content: true
      meta_fields_to_convert: judge
      line_prefix: '- '
      debug: false

connections:
- sender: file_classifier.text/plain
  receiver: text_converter.sources
- sender: file_classifier.application/pdf
  receiver: pdf_converter.sources
- sender: file_classifier.text/markdown
  receiver: markdown_converter.sources
- sender: file_classifier.text/html
  receiver: html_converter.sources
- sender: file_classifier.application/vnd.openxmlformats-officedocument.wordprocessingml.document
  receiver: docx_converter.sources
- sender: file_classifier.application/vnd.openxmlformats-officedocument.presentationml.presentation
  receiver: pptx_converter.sources
- sender: file_classifier.application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
  receiver: xlsx_converter.sources
- sender: file_classifier.text/csv
  receiver: csv_converter.sources
- sender: text_converter.documents
  receiver: joiner.documents
- sender: pdf_converter.documents
  receiver: joiner.documents
- sender: markdown_converter.documents
  receiver: joiner.documents
- sender: html_converter.documents
  receiver: joiner.documents
- sender: docx_converter.documents
  receiver: joiner.documents
- sender: pptx_converter.documents
  receiver: joiner.documents
- sender: splitter.documents
  receiver: joiner_xlsx.documents
- sender: xlsx_converter.documents
  receiver: joiner_xlsx.documents
- sender: csv_converter.documents
  receiver: joiner_xlsx.documents
- sender: joiner_xlsx.documents
  receiver: document_embedder.documents
- sender: document_embedder.documents
  receiver: writer.documents
- sender: joiner.documents
  receiver: DeepsetDocumentMetadataPreProcessor.documents
- sender: DeepsetDocumentMetadataPreProcessor.documents
  receiver: splitter.documents

inputs:
  files:
  - file_classifier.sources

max_runs_per_component: 100

metadata: {}