Base64ImageJoiner

Join multiple lists of Base64Image objects into a single list.

The example pipeline below uses the component in a retrieval pipeline with a vision LLM: downloaded PDF and image files are converted to Base64Image objects on separate branches, and Base64ImageJoiner merges both branches into one list for the model.

# If you need help with the YAML format, have a look at [https://docs.cloud.deepset.ai/v2.0/docs/create-a-pipeline#create-a-pipeline-using-pipeline-editor](https://docs.cloud.deepset.ai/v2.0/docs/create-a-pipeline#create-a-pipeline-using-pipeline-editor).
# This section defines components that you want to use in your pipelines. Each component must have a name and a type. You can also set the component's parameters here.
# The name is up to you; pick a friendly name for your component. You then use these names when specifying the connections in the pipeline.
# Type is the class path of the component. You can check the type on the component's documentation page.
components:
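  # Keyword-based (BM25) retrieval branch: fetches the top 20 lexical matches from OpenSearch.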
  BM25Retriever:
    type: haystack_integrations.components.retrievers.opensearch.bm25_retriever.OpenSearchBM25Retriever
    init_parameters:
      document_store:
        type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
        init_parameters:
          hosts:
          index: ''
          max_chunk_bytes: 104857600
          embedding_dim: 1024
          return_embedding: false
          method:
          mappings:
          settings:
          create_index: true
          http_auth:
          use_ssl:
          verify_certs:
          timeout:
      top_k: 20
      fuzziness: 0

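  # Embeds the query text with BAAI/bge-m3 so it can be compared against the stored document embeddings.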
  Embedder:
    type: deepset_cloud_custom_nodes.embedders.nvidia.text_embedder.DeepsetNvidiaTextEmbedder
    init_parameters:
      normalize_embeddings: true
      model: BAAI/bge-m3

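  # Dense (vector) retrieval branch: fetches the top 20 documents by embedding similarity.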
  EmbeddingRetriever:
    type: haystack_integrations.components.retrievers.opensearch.embedding_retriever.OpenSearchEmbeddingRetriever
    init_parameters:
      document_store:
        type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
        init_parameters:
          hosts:
          index: ''
          max_chunk_bytes: 104857600
          embedding_dim: 1024
          return_embedding: false
          method:
          mappings:
          settings:
          create_index: true
          http_auth:
          use_ssl:
          verify_certs:
          timeout:
      top_k: 20

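  # Merges the BM25 and embedding retrieval results into a single candidate list.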
  DocumentJoiner:
    type: haystack.components.joiners.document_joiner.DocumentJoiner
    init_parameters:
      join_mode: concatenate

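  # Re-ranks the merged candidates against the query and keeps the 5 most relevant documents.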
  Ranker:
    type: deepset_cloud_custom_nodes.rankers.nvidia.ranker.DeepsetNvidiaRanker
    init_parameters:
      model: BAAI/bge-reranker-v2-m3
      top_k: 5

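  # Groups the ranked documents by file_id and sorts each group by split_id so splits of the same file stay together.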
  MetaFieldGroupingRanker:
    type: haystack.components.rankers.meta_field_grouping_ranker.MetaFieldGroupingRanker
    init_parameters:
      group_by: file_id
      sort_docs_by: split_id

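  # Downloads the source files (PDF and image formats listed below) referenced by the ranked documents.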
  FileDownloader:
    type: deepset_cloud_custom_nodes.augmenters.deepset_file_downloader.DeepsetFileDownloader
    init_parameters:
      file_extensions:
      - .pdf
      - .png
      - .jpeg
      - .jpg
      - .gif

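  # Converts downloaded image files (PNG, JPEG, GIF) into Base64Image objects.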
  FileToBase64Image:
    type: deepset_cloud_custom_nodes.converters.file_to_image.DeepsetFileToBase64Image
    init_parameters:
      detail: auto

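  # Renders PDF documents as Base64Image objects; documents without a page number in their metadata are converted in full.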
  PDFToBase64Image:
    type: deepset_cloud_custom_nodes.converters.pdf_to_image.DeepsetPDFDocumentToBase64Image
    init_parameters:
      detail: high
      missing_page_number: all_pages

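  # Builds the text prompt: lists each document with its index so the model can cite [n] for both the document and its matching image.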
  PromptBuilder:
    type: haystack.components.builders.prompt_builder.PromptBuilder
    init_parameters:
      required_variables: '*'
      template: |
        Answer the questions briefly and precisely using the images and text passages provided.
        Only use images and text passages that are related to the question to answer it.
        Give reasons for your answer.
        In your answer, refer only to images and text passages that are relevant to answering the query.
        Each image is related to exactly one document. You see the images in exactly the same order as the documents.
        Use a reference in the form [NUMBER OF IMAGE] if you are using information from an image, or [NUMBER OF DOCUMENT] if you are using information from a document.
        For example, for Document [1] use the reference [1]; for Image 1, also use the reference [1].

        These are the documents:
        {%- if documents|length > 0 %}
        {%- for document in documents %}
        Document [{{ loop.index }}] :
        Name of Source File: {{ document.meta.file_name }}
        Relates to image: [{{ loop.index }}]
        {{ document.content }}
        {% endfor -%}
        {%- else %}
        No relevant documents found.
        Respond with "Sorry, no matching documents were found, please adjust the filters or try a different question."
        {% endif %}

        Question: {{ question }}
        Answer: 

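  # Vision-capable OpenAI model that answers using the prompt text and the joined Base64 images.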
  LLM:
    type: deepset_cloud_custom_nodes.generators.openai_vision.DeepsetOpenAIVisionGenerator
    init_parameters:
      api_key: {"type": "env_var", "env_vars": ["OPENAI_API_KEY"], "strict": false}
      model: gpt-4o
      generation_kwargs:
        max_tokens: 650
        temperature: 0
        seed: 0

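  # Builds the final answers from the LLM replies, resolving references against the ranked documents.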
  AnswerBuilder:
    type: deepset_cloud_custom_nodes.augmenters.deepset_answer_builder.DeepsetAnswerBuilder
    init_parameters:
      reference_pattern: acm

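  # Routes the rank-tagged documents by MIME type: PDFs go to PDFToBase64Image, images go to FileToBase64Image.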
  MetadataRouter:
    type: haystack.components.routers.metadata_router.MetadataRouter
    init_parameters:
      rules:
        pdf:
          operator: OR
          conditions:
          - field: meta.mime_type
            operator: ==
            value: application/pdf
        image:
          operator: OR
          conditions:
          - field: meta.mime_type
            operator: ==
            value: image/png
          - field: meta.mime_type
            operator: ==
            value: image/jpg
          - field: meta.mime_type
            operator: ==
            value: image/jpeg
          - field: meta.mime_type
            operator: ==
            value: image/gif

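  # Sorts the joined images by the meta._rank value set by RankAdder so they match the document order in the prompt.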
  RankSorter:
    type: haystack.components.converters.output_adapter.OutputAdapter
    init_parameters:
      output_type: List[deepset_cloud_custom_nodes.dataclasses.chat_message_with_images.Base64Image]
      unsafe: true
      template: "{{ images|sort(attribute=\"meta._rank\") }}"

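  # Writes each document's position into meta._rank so the matching images can later be sorted back into the same order.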
  RankAdder:
    type: haystack.components.converters.output_adapter.OutputAdapter
    init_parameters:
      output_type: List[haystack.Document]
      custom_filters: ''
      unsafe: true
      template: |
        {%- for document in documents -%}
          {%- set _ = document.meta.update({'_rank': loop.index}) -%}
        {%- endfor -%}
        {{ documents }}

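  # Joins the image lists produced by the PDF and image converters into a single list for the LLM.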
  Base64ImageJoiner:
    type: deepset_cloud_custom_nodes.joiners.base64_image_joiner.Base64ImageJoiner
    init_parameters: {}

connections:  # Defines how the components are connected
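# Retrieval flow: BM25Retriever + EmbeddingRetriever -> DocumentJoiner -> Ranker -> MetaFieldGroupingRanker -> FileDownloader -> RankAdder.
# Image flow: RankAdder -> MetadataRouter -> PDFToBase64Image / FileToBase64Image -> Base64ImageJoiner -> RankSorter -> LLM -> AnswerBuilder.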
- sender: BM25Retriever.documents
  receiver: DocumentJoiner.documents
- sender: EmbeddingRetriever.documents
  receiver: DocumentJoiner.documents
- sender: PromptBuilder.prompt
  receiver: LLM.prompt
- sender: PromptBuilder.prompt
  receiver: AnswerBuilder.prompt
- sender: Embedder.embedding
  receiver: EmbeddingRetriever.query_embedding
- sender: DocumentJoiner.documents
  receiver: Ranker.documents
- sender: Ranker.documents
  receiver: MetaFieldGroupingRanker.documents
- sender: MetaFieldGroupingRanker.documents
  receiver: FileDownloader.documents
- sender: MetadataRouter.image
  receiver: FileToBase64Image.documents
- sender: MetadataRouter.pdf
  receiver: PDFToBase64Image.documents
- sender: RankSorter.output
  receiver: LLM.images
- sender: LLM.replies
  receiver: AnswerBuilder.replies
- sender: PDFToBase64Image.base64_images
  receiver: Base64ImageJoiner.images
- sender: FileToBase64Image.base64_images
  receiver: Base64ImageJoiner.images
- sender: Base64ImageJoiner.images
  receiver: RankSorter.images
- sender: RankAdder.output
  receiver: MetadataRouter.documents
- sender: FileDownloader.documents
  receiver: RankAdder.documents
- sender: RankAdder.output
  receiver: AnswerBuilder.documents
- sender: RankAdder.output
  receiver: PromptBuilder.documents

inputs:  # Define the inputs for your pipeline
  query:  # These components will receive the query as input
  - "BM25Retriever.query"
  - "PromptBuilder.question"
  - "AnswerBuilder.query"
  - Embedder.text
  - Ranker.query
  filters:  # These components will receive a potential query filter as input
  - "BM25Retriever.filters"
  - "EmbeddingRetriever.filters"
  files:
  - FileDownloader.sources

outputs:  # Defines the outputs of your pipeline
  documents: "RankAdder.output"      # The retrieved documents, in ranked order
  answers: "AnswerBuilder.answers"   # The generated answers

max_runs_per_component: 100

metadata: {}