3  Vector Database

(ns notebooks.vector-database
  (:require [tablecloth.api :as tc]
            [notebooks.preparation :refer [ds]]
            [scicloj.kindly.v4.kind :as kind]
            [scicloj.tableplot.v1.plotly :as plotly])
  (:import (dev.langchain4j.data.segment TextSegment)
           (dev.langchain4j.store.embedding.inmemory InMemoryEmbeddingStore)
           (dev.langchain4j.model.embedding.onnx.allminilml6v2 AllMiniLmL6V2EmbeddingModel)
           (smile.manifold TSNE)))

In this section we will:

  1. Build an in-memory vector database using langchain4j

  2. Explore visualisations of this database to get an intuition about how the data is stored

3.1 Building a Vector Database from the Questions

3.1.1 Vector Embeddings

First, let’s separate out all of the questions from the original dataset, and store these under the name ‘questions-list’. We will also take a peek at the first question as a reminder of the type of data we will be storing.

;; All question texts: the :question column of the prepared dataset `ds`.
(def questions-list (:question ds))
;; Pretty-print the first question as a sample of the data that will be stored.
(kind/pprint (first questions-list))
["Deputy Rose Conway-Walsh asked the Taoiseach if he will provide an update on the legislative programme. "]

Next, let’s see what a vector embedding of this single question looks like. To do this, we first have to define a model that will perform the translation of the text into a vector of numbers. We will use langchain4j’s embedding model for this.

;; Single embedding-model instance used for every text -> vector conversion
;; in this notebook.
(def embedding-model (AllMiniLmL6V2EmbeddingModel/new))
(defn text->embedding
  "Wraps `text` in a TextSegment and hands it to `embedding-model`'s `embed`.
  Returns whatever `embed` returns; callers in this notebook call `.content`
  on the result to reach the embedding and `.vector` to reach its numbers."
  [text]
  (let [segment (TextSegment/from text)]
    (.embed embedding-model segment)))
;; Embed the first question so the raw result can be inspected.
(def sample-question-embedding (text->embedding (first questions-list)))
;; Unwrap the result (.content) and print its underlying vector (.vector).
(kind/pprint (-> sample-question-embedding
                 .content
                 .vector))
[[-0.051565964, 0.04267587, -9.377065E-4, -0.0030915863, -0.017806293,
  0.014194333, -0.018429384, -0.013350692, -0.07762441, -0.012239637,
  -0.0529695, -0.040091537, -0.081397325, -0.056254886, 0.03193489,
  0.08905221, 0.0022227038, -0.0564618, -0.04947394, 0.02279298,
  -0.00782095, 0.03845661, 0.0068886885, 0.03738498, 0.04981824,
  -0.041129097, -0.037626088, -0.03011126, 0.046217047, -0.05281599,
  0.03244865, 0.045919325, 0.008503602, 0.03687103, -0.0034759927,
  -0.002851494, 0.034838345, 0.0011097376, -0.0037185736, 0.03027001,
  -0.0048531475, -0.07427281, -0.093037955, -0.0138438605, -0.0308969,
  0.023753913, 0.035950407, -0.021267619, -0.05879217, -0.09269644,
  -0.043296475, -0.095704004, 0.0029093265, 0.044089917, 0.030580506,
  -0.014015126, 0.061709005, 0.065654665, -0.009307033, 0.010860614,
  -0.06259037, 0.0117607, -0.0576449, 0.113929085, 0.023223788,
  0.016540453, 0.019812055, -0.030971108, -0.010054616, -0.15600711,
  -0.031770885, 0.0024046933, 0.060362175, -0.07228675, -0.04337646,
  -0.09876244, 0.04153598, 0.037470736, 0.07756216, -0.070215054,
  -0.060134128, 0.013998699, -0.016173538, 0.029805055, 0.07070556,
  -0.01507618, -0.03835861, 0.048175313, -0.06727677, -0.010244839,
  0.028810982, 0.0390896, 0.0055721314, 0.032842655, -0.05275868,
  0.13629042, -0.014003614, -0.04419636, -0.032983344, 0.09565612,
  0.05393285, 0.0910637, 0.026580364, -0.009264091, -0.04118206,
  0.011564885, 0.08702904, 0.04028348, -0.010034251, 0.0054587405,
  0.048542187, -0.029306246, -0.019607924, -0.047698632, -0.063202776,
  -0.09643034, 1.0316832E-4, 0.06744163, -0.09687647, -0.04674699,
  0.0058854735, 0.034095988, 0.019235175, -0.018001923, -0.0034221995,
  -0.020554045, -0.012234825, -3.6309132E-33, 0.04463269, 0.030338792,
  -0.03187322, -0.0048437365, 0.07475557, 0.012541492, 0.044128098,
  0.016762326, 0.04042982, -0.05459035, 0.058134697, 0.035715867,
  -0.03255846, 0.04439513, -0.08283844, -0.024363296, -0.09230695,
  -0.0071303807, -0.023412947, -0.032000907, 0.07322967, -0.06958443,
  -0.04361295, -0.04767589, 0.11027227, -0.053116444, 0.044424597,
  0.012289628, 0.03139631, 0.033425335, 0.026040848, -0.002710267,
  -0.055162344, -0.09886357, -4.5965874E-4, 0.016635926, -0.053111225,
  -0.010389073, -0.011350824, -0.038635246, 0.07972643, 0.020848827,
  0.025916813, 0.008766708, -0.012872535, 0.0054667974, 0.009009987,
  0.00592326, 0.073603116, -0.07838817, 0.020052757, -0.07254255,
  -0.056754358, -0.0543825, 0.0574838, -0.019832866, -0.03909094,
  -0.0054699373, 0.031185323, 0.01972184, 0.005599661, -0.01614101,
  -0.0040073236, 0.01996001, -0.05492752, -0.03275866, -0.04975333,
  0.01972705, 0.079426415, -0.069795646, -0.0517581, -0.02004581,
  -0.016190957, 0.0057196477, -0.05563514, -0.035618022, -0.092297554,
  -0.02072491, 0.073071025, -0.038475942, 0.07049898, -0.033629708,
  -0.020644072, 0.110064104, 0.058032416, 0.016730338, 0.08066387,
  0.09109327, -0.024225721, 0.03415113, -0.01893194, -0.014609733,
  0.049713243, 0.011413008, 0.07755695, -1.3411656E-34, -0.05232376,
  0.044075064, -0.09067795, -0.039530728, 0.034150038, -0.019966424,
  0.0014640653, -0.033090312, 0.051464304, 0.012076632, 0.038269863,
  0.031790987, -0.05497796, 0.041764397, -0.011204656, -0.04828543,
  -0.061260473, -0.06796844, 0.028790096, 0.074365325, -0.047738094,
  0.058247678, -0.06664268, 0.022542724, 0.030771937, 0.032335028,
  0.063835114, 0.01098816, 0.048588622, -0.0074292547, -0.08455504,
  -0.09686805, -0.11446031, 0.038620643, -0.018000288, -0.005783862,
  0.117290996, -0.04332185, -0.0918883, 0.04227893, 0.12735373,
  -0.058071226, 0.008141146, 0.02378829, -0.004828894, 0.03734089,
  -0.044890318, 0.037734233, -0.045360208, 0.019411176, 0.020944586,
  0.025412127, 0.025040533, 0.061652146, 0.03536717, -0.023617806,
  -0.046533704, 0.020816347, 0.07020258, -0.08555988, -0.08552976,
  0.12119743, 0.033828687, -0.04529947, 0.0726635, 0.019280046,
  -0.052829847, -0.0561605, 0.12183744, 0.040792726, -0.014878135,
  -0.045390513, -0.004088519, -0.124109335, -0.005920941, 0.030258713,
  0.030699698, -0.12028908, -0.012539742, -0.020336179, 0.053280458,
  0.03068848, 0.011678796, -0.019068768, -0.023170982, -0.04910983,
  0.07749273, -0.017666962, 0.038362473, 0.012553131, -0.043134313,
  -0.015001261, -0.05271599, -0.053050105, -0.0047275033, -2.522037E-8,
  -0.0060613886, 0.057002496, -0.0063224407, 0.016500616, 0.06869513,
  -0.020051625, -0.050884914, -0.04872227, -0.035153884, -0.058405083,
  0.075425535, 0.06699011, 0.0998922, -0.0014380281, 0.06891766,
  -0.056399353, -0.031794406, -0.085502684, -0.005886522, -0.055097133,
  -0.01956806, 0.040903255, 0.02040228, 0.10583605, 0.0389902,
  0.018493123, -0.032337867, 0.03424851, -0.0490942, 6.581282E-4,
  0.07262681, 0.11719072, 0.06728717, 0.019798033, 0.0982596,
  -0.064961344, -0.075948685, -0.018207854, 0.058165018, -0.078717746,
  0.031760685, -0.01400299, 0.06883344, 0.020258209, 0.027536964,
  -0.10173048, 0.05966868, -0.025365265, 0.057640944, -0.07237203,
  -0.008312521, -0.04669108, 0.048200443, -0.007128292, 0.036636632,
  0.018558005, -0.014675374, -0.038298365, -0.032773625, -0.0011454803,
  0.05334099, -0.07009548, -0.0065492177, -0.066353604]]

We can see it is just an array of floating point numbers. Let’s see how large the vector embedding is:

;; Number of components in the sample embedding's vector.
(count (.vector (.content sample-question-embedding)))
384

In order to store the embeddings in a database, we will use an in-memory database provided by langchain4j. There are also options for using a more robust solution like postgres, but for testing/exploration purposes, an in-memory database will do.

We will first define a short function for adding a question to a memory store:

(defn add-doc-to-store
  "Embeds the text `question` with `embedding-model` and adds the resulting
  embedding, together with its TextSegment, to `store`.
  Returns whatever the store's `add` call returns."
  [question store]
  (let [segment   (TextSegment/from question)
        response  (.embed embedding-model segment)
        embedding (.content response)]
    (.add store embedding segment)))

At this stage, we would define a new store and add the questions. The in-memory database store function provided by langchain4j also contains an option to convert the data to json. For performance/convenience reasons, I have already pre-made that json file and the db-store-questions variable below simply reads from that file to load the database into memory. I have left the code used to generate the json file in as a comment for reference. You can evaluate the code within this comment if you want to re-build the database.

(comment
  ;; Rebuilds the questions store from scratch and serialises it to the same
  ;; JSON file that `db-store-questions` is loaded from. Evaluate manually;
  ;; embedding every question can take a while.
  (let [db-store (InMemoryEmbeddingStore/new)
        out-path "data/retrieval_store/db-store-questions.json"]
    ;; `run!` is eager, so every question is added before serialising
    ;; (no reliance on `count` to force a lazy `map` of side effects).
    (run! #(add-doc-to-store % db-store) questions-list)
    (spit out-path (.serializeToJson db-store))
    (println (str (count questions-list)
                  " records serialised to a json file at " out-path))))
;; Load the pre-built in-memory embedding store from its serialised JSON file.
(def db-store-questions (InMemoryEmbeddingStore/fromFile "data/retrieval_store/db-store-questions.json"))

3.1.2 Testing Question Lookup

This function takes in some ‘text’ as a query, and returns ‘n’ number of similar questions, along with their similarity score.

(defn query-db-store
  "Embeds `text` and asks `db-store-questions` for up to `n` relevant entries.
  Returns a lazy sequence of maps, one per entry:
    :text  - the text of the stored segment
    :score - the score reported by the store for that entry"
  [text n]
  (let [query-embedding (-> embedding-model (.embed text) .content)
        matches         (.findRelevant db-store-questions query-embedding n)]
    (for [match matches]
      {:text  (.text (.embedded match))
       :score (.score match)})))

As a test question, I’ll take an actual question that was asked at a point in time later than that captured by the dataset. It’s a question about electric vehicle charging points at airports.

;; First test query: a question about EV charging points at airports.
(def test-question-1 "Deputy Emer Currie asked the Minister for Transport his plans to expand EV charging points at the State's airports to facilitate more EV drivers and an increase in EV car rental; and if he will make a statement on the matter.")
;; Render the test question as markdown.
(kind/md test-question-1)

Deputy Emer Currie asked the Minister for Transport his plans to expand EV charging points at the State’s airports to facilitate more EV drivers and an increase in EV car rental; and if he will make a statement on the matter.

Now, let’s see what questions are similar.

;; Show the 5 stored questions most similar to test-question-1, with scores.
(kind/table
 (query-db-store test-question-1 5))
text score
Deputy Jennifer Whitmore asked the Minister for Transport the number of EV charging points delivered, by county, in 2022 and 2023, respectively; and if he will make a statement on the matter. 0.8443775651039741
Deputy Fergus O'Dowd asked the Minister for Transport if any consideration has been given to extending the low emissions vehicle toll incentive since it ceased at the end of December to continue to incentivise the transition to EVs; and if he will make a statement on the matter. 0.8312304815949974
Deputy Patrick Costello asked the Minister for Transport further to Parliamentary Question No. 301 of 17 January 2024 (details supplied), if the plan referenced includes keeping regular non-rapid charging points which many batteries on older model EVs are not equipped for. 0.829099570830286
Deputy Aengus Ó Snodaigh asked the Minister for Transport what the national strategy is to roll out more high-speed charging points at service stations or laybys along the State's motorways and national road network to facilitate electric car users and to encourage others to switch to EVs. 0.8201042488965056
Deputy Robert Troy asked the Minister for Transport if he will provide an update on the EU Just Transition Fund community facilities EV charging point scheme; when an announcement will be made on this scheme and can he ensure an application (details supplied) is approved for funding. 0.8068965874069679

As we can see, there appear to be no pre-existing questions within the timeframe of the dataset that relate specifically to EV charging at airports. However, we were able to retrieve questions that generally related to EV charging.

Let’s try with a question that is similar to something already in the database.

;; Pick the question at index 10 among rows whose :topic is "Passport Services"
;; and render it as markdown.
(-> ds
    (tc/select-rows #(= (% :topic) "Passport Services"))
    :question
    (nth 10)
    (kind/md))

Deputy Bernard J. Durkan asked the Tánaiste and Minister for Foreign Affairs the extent to which he can put in place an effective turn around for applications for passport renewals or new passport applications; and if he will make a statement on the matter.

;; Second test query: a reworded version of the passport question shown above.
(def test-question-2 "The Deputy asked the Minister for Foreign Affairs if he can put in place an effective process for applications for passport renewals or new passport applications")
;; Show the 5 stored questions most similar to test-question-2, with scores.
(kind/table
 (query-db-store test-question-2 5))
text score
Deputy Bernard J. Durkan asked the Tánaiste and Minister for Foreign Affairs the extent to which he can put in place an effective turn around for applications for passport renewals or new passport applications; and if he will make a statement on the matter. 0.9098782341205796
Deputy Alan Dillon asked the Minister for Justice if a passport application by a person (details supplied) will be reviewed and expedited; and if she will make a statement on the matter. 0.8612346137136901
Deputy Éamon Ó Cuív asked the Tánaiste and Minister for Foreign Affairs when an application for a passport renewal will be completed (details supplied); the reason for the delay in issuing said passport; and if he will make a statement on the matter. 0.8522896890912129
Deputy Niamh Smyth asked the Tánaiste and Minister for Foreign Affairs if a passport application (details supplied) will be expedited; and if he will make a statement on the matter. 0.8507559107527245
Deputy James Lawless asked the Tánaiste and Minister for Foreign Affairs if a passport application (details supplied) will be expedited; and if he will make a statement on the matter. 0.8469334066937866

We can see that we do indeed return the matching question, along with other questions relating to individuals who are experiencing delays with their passport applications.

3.1.3 Answering a Question

Let’s finally use this method to return the best answers based on the question given. This approach could be later used to provide context for a RAG model that would generate answers.

(defn get-answers-for-question
  "Finds the 5 stored questions most similar to `question` (via
  `query-db-store`) and returns the rows of `ds` whose :question is one of
  them, keeping only the :answer and :date columns.
  Rows come back in the order they appear in `ds`, not in similarity order."
  [question]
  (let [matched-texts (map :text (query-db-store question 5))
        matched?      (fn [row] (some #{(row :question)} matched-texts))
        matched-rows  (tc/select-rows ds matched?)]
    (tc/select-columns matched-rows [:answer :date])))

As we can see in the table below, we also capture the date the question was answered, and this could be useful in the context of a RAG model when we might perhaps want to return a less similar, but more recent answer.

;; Answers (with dates) for the questions matching test-question-2.
(get-answers-for-question test-question-2)

data/20250302_PQs_10K_2024_answers.csv [5 2]:

:answer :date
The Passport Service is successfully meeting high demand for passports. All turnaround times are at or well-ahead of target and there are no backlogs. Currently the majority of child online renewal applications are issuing within 6 days and online adult renewal applications are issuing within 2 working days. First-time online applications are being processed within 18-19 days. The advertised turnaround times are as follows: My Department is currently running an extensive communications campaign to encourage passport applicants to apply early and to apply online. This campaign has been very successful in increasing the numbers of applications received in January 2024 and will help to ensure that those citizens who need to renew their passport this year or apply for a passport for a child will have their passports well before the traditional summer peak season. The Passport Online service offers Irish citizens the ability to apply online for their passport 24 hours a day, 7 days a week. It is a user-friendly, efficient service that consistently offers processing times up to 4 times faster than paper-based passport renewal applications. The Passport Service is well-resourced to meet forecasted demand for 2024. My Department has implemented a staffing plan to ensure that excellent customer service is maintained throughout periods of peak demand. There are currently 800 staff working in the Passport Service and my Department is continuing to work to increase resources within the service. Currently, 100% of calls to the Customer Service Hub are being answered and customers can easily access the information they need through the Hub. The Customer Service Hub responds to an average of 10,000 queries per week. In 2023, the Passport Service was ranked 18th out of 150 public-facing organisations in a survey conducted by the Customer Experience (CX) group. This ranking recognised the Passport Service as the best customer service brand in the public sector. 
In addition, the Customer Service Hub in the Passport Service was shortlisted for the “Best Customer Service Team” award at the 2023 Customer Experience Awards. The public offices of the Passport Service in Dublin 2 and in Cork operate a four-day Urgent Appointment service for renewal applicants. The Dublin office also offers a one-day Urgent Appointment service. 2024 is expected to be another busy year for the Passport Service and the Passport Service is in an excellent position to meet the demand forecasted for this year. I would ask the Deputy to encourage his constituents to apply for their passports through Passport Online, as the quickest, easiest and most cost effective way to apply for a passport. 2024-01-25
I propose to take Questions Nos. 17, 23 and 35 together. With regard to the specific applications about which the Deputies have enquired, the Passport Service has issued passports to the applicants. 2024-01-17
With regard to the specific application about which the Deputy has enquired, the standard turnaround time for child renewal applications is 15 working days from the date the Passport Service receives the required supporting documents. The applicant’s supporting documents were received on 22 January 2024. This application is within the standard turnaround time and has not yet reached its issue-by date. 2024-02-07
The granting of Irish citizenship through naturalisation is governed by the provisions of the Irish Nationality and Citizenship Act 1956, as amended. All applications for a certificate of naturalisation are processed in line with the eligibility criteria as set out under this Act. The naturalisation application from the person referred to by the Deputy was not progressed due to lack of engagement; specifically, the failure to provide a copy of the applicant’s passport. I am advised the Citizenship Division will seek to make further contact with the applicant directly in this regard. 2024-02-07
With regard to the specific application about which the Deputy has enquired, this application has been approved. 2024-03-20

3.2 Visualising with tSNE

As we saw above, our questions are embedded in the database as a vector of numbers. In a certain sense, we can think of these numbers as coordinates within a high-dimensional space.

What the ‘t-distributed stochastic neighbor embedding’ (t-SNE) method allows us to do is reduce these high-dimensional vectors into a 2d or 3d set of coordinates.

A nice analogy of what is happening here is when the 3d/spherical space of the Earth’s surface is translated into a 2d map. The importance of this analogy lies in the caveats that accompany these 2d representations of the earth. Different projections can convey different significations, a point that is highlighted in a scene from the West Wing where the “Cartographers for social equality” give a presentation to the staff on the socio-political implications of different projection techniques. - https://www.youtube.com/watch?v=AMfXVWFBrVo

Similarly, with t-SNE and related methods, we have to remain conscious of the limitations of this kind of approach. There is a nice blog post about some of the pitfalls possible when creating t-SNE plots, as well as a podcast episode discussing the limitations of UMAP, a similar dimension-reduction method.

In the case below, we are using this technique to help illustrate the relative proximity of questions in the database. For example, in the above cases where we find matching questions, are these matches ‘neighbours’ in a spatial sense? But as with the above caveats, we shouldn’t read too much into things like density or clusters or the magnitude of distance. At the same time, it might help us with our intuition around how ‘similarity’ works across the dataset. It is also interesting to explore what kind of tools we can use within clojure!

We will first look at whether ‘topic’ areas appear related within this coordinate space, and then will try to map where a sample question and its matches lie.

3.2.1 Topic Areas Projection

We will take the three topic areas we already used above:

  • Schools Building Projects

  • International Protection

  • Health Services

In theory, questions relating to these topics should be close to one another within the embedding space.

;; Rows of `ds` for three topics, reduced to :question and :topic, plus an
;; :embedding column holding each question's embedding as a Clojure vector.
(def ds-schools-immigration-health
  (let [topics        ["Schools Building Projects" "International Protection" "Health Services"]
        in-topic?     (fn [row] (some #{(:topic row)} topics))
        question->vec (fn [q] (-> (text->embedding q) .content .vector vec))]
    (-> ds
        (tc/select-rows in-topic?)
        (tc/select-columns [:question :topic])
        (tc/map-columns :embedding [:question] question->vec))))

To reduce the embedding to a 2d space, we will use the Smile implementation of t-SNE.

This function takes in 4 parameters as arguments:

  • number of dimensions

  • perplexity

  • learning rate

  • iterations

The two most impactful variables are ‘perplexity’ and ‘iterations’. Perplexity is something like the expected group ‘size’. For example, if you had a dataset with a hundred points and a perplexity of 100, the algorithm would try to keep these points close together in the coordinate space.

We also need to use tablecloth’s transformation of the data to ‘double-arrays’ to transform the data to a datatype expected by the model.

(defn make-t-sne-coords
  "Runs Smile's t-SNE over the :embedding column of `dataset` and returns the
  resulting coordinates as a dataset.

  Options (second argument, a map):
    :dimensions    - output dimensions (default 2)
    :perplexity    - default 25
    :learning-rate - default 200
    :iterations    - default 1000

  Output columns are renamed to :x :y when :dimensions is 2, and to :x :y :z
  for any other value."
  [dataset {:keys [dimensions perplexity learning-rate iterations]
            :or   {dimensions    2
                   perplexity    25
                   learning-rate 200
                   iterations    1000}}]
  (let [;; TSNE is given the embeddings as rows of double arrays.
        points (-> (into [] (:embedding dataset))
                   (tc/dataset)
                   (tc/rows :as-double-arrays))
        tsne   (TSNE. points dimensions perplexity learning-rate iterations)
        coords (tc/dataset (.coordinates tsne))
        axes   (if (= dimensions 2) [:x :y] [:x :y :z])]
    (tc/rename-columns coords axes)))
(defn plot-t-sne-coords
  "Projects the :embedding column of `ds` with `make-t-sne-coords` (using
  `t-sne-opts`), attaches `labels` as a :label column, and draws the result
  as a 700px-wide plotly point layer configured by `plot-opts`."
  [ds labels t-sne-opts plot-opts]
  (let [coords   (make-t-sne-coords ds t-sne-opts)
        labelled (tc/add-column coords :label labels)]
    (-> labelled
        (plotly/base {:=width 700})
        (plotly/layer-point plot-opts))))
;; Five projections of the same topic subset, coloured by topic, varying the
;; t-SNE options. Each top-level form renders as its own plot.

;; 100 iterations, default perplexity.
(plot-t-sne-coords ds-schools-immigration-health
                   (:topic ds-schools-immigration-health)
                   {:iterations 100}
                   {:=color :label})
;; 500 iterations, default perplexity.
(plot-t-sne-coords ds-schools-immigration-health
                   (:topic ds-schools-immigration-health)
                   {:iterations 500}
                   {:=color :label})
;; 1000 iterations, default perplexity.
(plot-t-sne-coords ds-schools-immigration-health
                   (:topic ds-schools-immigration-health)
                   {:iterations 1000}
                   {:=color :label})
;; 1000 iterations, very low perplexity.
(plot-t-sne-coords ds-schools-immigration-health
                   (:topic ds-schools-immigration-health)
                   {:iterations 1000
                    :perplexity 3}
                   {:=color :label})
;; 1000 iterations, very high perplexity.
(plot-t-sne-coords ds-schools-immigration-health
                   (:topic ds-schools-immigration-health)
                   {:iterations 1000
                    :perplexity 300}
                   {:=color :label})

After 100 iterations (quite early in the process), there isn’t much separation at all. After 500 iterations, the points begin to split apart. Wildly varying the perplexity doesn’t seem to have a huge impact on this visualisation.

3.2.2 Visualising Question Retrieval

For this exercise we’ll use a smaller subset of the questions.

;; The first 1000 rows of `ds` (row indices 0-999).
(def ds-subset
  (tc/select-rows ds (range 1000)))

We’ll also add the question embeddings to this subset.

;; `ds-subset` plus an :embedding column holding each question's embedding as
;; a Clojure vector. `defonce` means re-evaluating the namespace does not
;; recompute the embeddings.
(defonce ds-subset-with-embeddings
  (let [question->vec (fn [q]
                        (vec (.vector (.content (text->embedding q)))))]
    (tc/map-columns ds-subset :embedding [:question] question->vec)))
;; Free-text query used for the retrieval visualisations below.
(def sample-question "The Deputy asks the Minister a question about the number of vacant houses")

Next we’ll create a temporary store for these questions, and add them to it.

;; Fresh in-memory store that will hold only the subset's questions.
(def questions-subset-store (InMemoryEmbeddingStore/new))
;; Add every subset question to the store. `map` is lazy, so `count` is what
;; forces the additions to run; its value is the number of questions processed.
(count (map #(add-doc-to-store % questions-subset-store) (:question ds-subset)))
1000

This function, similar to the ones above, returns the ‘n’ closest matching questions (we will use n = 5). These will be what we try to visualise.

(defn get-matching-questions
  "Embeds `question` and asks `questions-subset-store` for up to `n` relevant
  entries. Returns a lazy sequence of maps, one per entry:
    :question   - the text of the stored segment
    :similarity - the score reported by the store for that entry"
  [question n]
  (let [query-embedding (-> embedding-model (.embed question) .content)
        hits            (.findRelevant questions-subset-store query-embedding n)]
    (for [hit hits]
      {:question   (.text (.embedded hit))
       :similarity (.score hit)})))
;; The 5 subset questions closest to the sample question.
(def q-matches (get-matching-questions sample-question 5))

Next, we will add custom labels to the data. We will also add our sample question into the dataset and label it accordingly. The labels will be:

  • ‘Default’ - the questions that haven’t been matched
  • ‘Match’ - one of the 5 matching questions
  • ‘Question’ - the question itself

3.2.2.1 2D Projection

(defn add-label
  "Returns \"Match\" when `ds-question` equals the :question of one of the
  maps in `matching-qs`, otherwise \"Default\"."
  [ds-question matching-qs]
  (let [matched-texts (mapv :question matching-qs)
        hit           (some #{ds-question} matched-texts)]
    (if hit
      "Match"
      "Default")))
(defn question-retrieval-visulation-data
  "Builds the dataset used to visualise retrieval for `question`.

  Every row of `ds-subset-with-embeddings` is kept (columns :question :label
  :topic :embedding) and labelled \"Match\" if it is one of the 5 closest
  questions, otherwise \"Default\". One extra row is added for `question`
  itself, labelled \"Question\", carrying its own embedding and no :topic."
  [question]
  (let [matches      (get-matching-questions question 5)
        q-embedding  (vec (.vector (.content (text->embedding question))))
        label-for    (fn [q] (add-label q matches))
        labelled-ds  (-> ds-subset-with-embeddings
                         (tc/map-columns :label [:question] label-for)
                         (tc/select-columns [:question :label :topic :embedding]))
        question-row {:question  question
                      :label     "Question"
                      :embedding q-embedding}
        ;; Where the extra row lands depends on how `into` conj-es onto the
        ;; rows collection, so this call is kept exactly as it was.
        all-rows     (into (tc/rows labelled-ds :as-maps) [question-row])]
    (tc/dataset all-rows)))
(defn vis-question-retrieval
  "Renders an ECharts scatter plot of the 2D t-SNE projection for `question`.

  `opts` is passed to `make-t-sne-coords`. Points are drawn as three series:
  \"Default\" (light grey), \"Question\" (symbol size 20) and \"Match\"
  (symbol size 15). The :x/:y columns are read, so `opts` is expected to leave
  :dimensions at 2."
  [question opts]
  (let [labelled   (question-retrieval-visulation-data question)
        coords     (-> (make-t-sne-coords labelled opts)
                       (tc/dataset)
                       (tc/add-column :label (:label labelled))
                       (tc/add-column :question (:question labelled))
                       (tc/add-column :topic (:topic labelled)))
        ;; [x y] pairs for all points carrying the given label.
        points-for (fn [label]
                     (-> coords
                         (tc/select-rows #(= (% :label) label))
                         (tc/select-columns [:x :y])
                         (tc/rows :as-vectors)))
        ;; One scatter series per label; `extra` holds per-series styling.
        series     (fn [label extra]
                     (merge {:data (points-for label)
                             :name label
                             :type "scatter"}
                            extra))]
    (kind/echarts
     {:xAxis  {}
      :yAxis  {}
      :series [(series "Default" {:itemStyle {:color "#d3d3d3"}})
               (series "Question" {:symbolSize 20})
               (series "Match" {:symbolSize 15})]})))
;; 2D retrieval plot for the sample question, with perplexity 10.
(vis-question-retrieval sample-question {:perplexity 10})
;; Alternative query kept for manual experimentation; not evaluated on load.
(comment


  (vis-question-retrieval "A question asked about housing"
                          {:perplexity 10
                           :iterations 1000}))

3.2.2.2 3D Projection

;; 3D projection of the same labelled data, coloured by label.
;; Note: the local `ds` shadows the namespace-level `ds` inside this form only.
(let [ds (question-retrieval-visulation-data sample-question)]
  (plot-t-sne-coords ds
                     (:label ds)
                     {:perplexity 10 :dimensions 3}
                     {:=color :label :=coordinates :3d
                      :=mark-opacity 0.5}))

3.2.2.3 Visualising all similarities

All matches ranked

(defn add-similarity-ranking-labels
  "Ranks every question in `ds-subset-with-embeddings` by similarity to
  `question` and returns a dataset with two extra columns.

  The returned dataset contains one row for `question` itself (:topic \"User\",
  with its embedding) plus every row of `ds-subset-with-embeddings`, and adds:
    :ranking       - 0 for `question`; otherwise the 1-based position of the
                     row's question in the store's relevance results
    :ranking-label - \"Question\", \"1-10\", \"11-100\", \"101-500\" or \"501+\"

  When the same question text occurs more than once, the best (lowest) rank
  is used for all occurrences."
  [question]
  (let [q-embedding (.content (text->embedding question))
        ;; Request as many results as there are rows, so every subset question
        ;; receives a rank even if the subset size changes from 1000.
        max-results (tc/row-count ds-subset-with-embeddings)
        matches     (.findRelevant questions-subset-store q-embedding max-results)
        ;; The user question takes rank 0, followed by the store's results in
        ;; the order returned.
        ranked-qs   (cons question (map #(.text (.embedded %)) matches))
        ;; question text -> rank, built once instead of scanning a dataset
        ;; for every row. The first occurrence wins.
        rank-of     (reduce (fn [acc [rank q]]
                              (if (contains? acc q)
                                acc
                                (assoc acc q rank)))
                            {}
                            (map-indexed vector ranked-qs))
        lookup-idx  (fn [q] (get rank-of q))
        rank->label (fn [r]
                      (if (= r 0)
                        "Question"
                        ;; (condp > r 11 ...) tests (> 11 r), i.e. r <= 10.
                        (condp > r
                          11  "1-10"
                          101 "11-100"
                          501 "101-500"
                          "501+")))]
    (-> (concat [{:question  question
                  :topic     "User"
                  :embedding (-> q-embedding (.vector) vec)}]
                (tc/rows ds-subset-with-embeddings :as-maps))
        (tc/dataset)
        (tc/map-columns :ranking [:question] lookup-idx)
        (tc/map-columns :ranking-label [:ranking] rank->label))))
(defn vis-similarity-rankings
  "Renders an ECharts scatter plot of the 2D t-SNE projection of all subset
  questions plus `question`, grouped by similarity-rank band.

  `opts` is passed to `make-t-sne-coords`. One series is drawn per band
  (\"501+\", \"101-500\", \"11-100\", \"1-10\") plus one for the question
  itself (red). The :x/:y columns are read, so `opts` is expected to leave
  :dimensions at 2."
  [question opts]
  (let [rankings-ds  (add-similarity-ranking-labels question)
        data         (-> (make-t-sne-coords rankings-ds opts)
                         (tc/dataset)
                         (tc/add-column :label (:ranking-label rankings-ds)))
        ;; [x y] pairs for all points carrying the given band label.
        filter-label (fn [ds label]
                       (-> ds
                           (tc/select-rows #(= (% :label) label))
                           (tc/select-columns [:x :y])
                           (tc/rows :as-vectors)))]
    (kind/echarts
     {:tooltip {}
      :xAxis   {}
      :yAxis   {}
      :series  [{:data      (filter-label data "501+")
                 :name      "501+"
                 :type      "scatter"
                 :itemStyle {:color "#C5E8B7"}}
                {:data      (filter-label data "101-500")
                 :name      "101-500"
                 :type      "scatter"
                 :itemStyle {:color "#83D475"}}
                {:data      (filter-label data "11-100")
                 :name      "11-100"
                 :itemStyle {:color "#2EB62C"}
                 :type      "scatter"}
                ;; The series name now matches its data; it was previously
                ;; "11-100", which duplicated the series above.
                {:data       (filter-label data "1-10")
                 :name       "1-10"
                 :symbolSize 15
                 :type       "scatter"}
                {:data       (filter-label data "Question")
                 :name       "Question"
                 :itemStyle  {:color "#DA0017"}
                 :symbolSize 15
                 :type       "scatter"}]})))
;; 2D similarity-rank plot for a housing query, with perplexity 50.
(vis-similarity-rankings "What is the government doing to improve local housing?"
                         {:perplexity 50})

3d

(defn vis-similarity-rankings-3d
  "Renders a 3D plotly scatter of the t-SNE projection of all subset questions
  plus `question`, coloured by similarity-rank band.

  `opts` is a map of t-SNE options passed on to `make-t-sne-coords`
  (:perplexity defaults to 10 here). :dimensions is always forced to 3,
  because the plot uses 3D coordinates. Passing `{}` or nil gives the
  previous fixed behaviour."
  [question opts]
  (let [rankings-ds (add-similarity-ranking-labels question)
        ;; Caller options override the default perplexity; :dimensions is
        ;; merged last so it cannot be overridden.
        t-sne-opts  (merge {:perplexity 10} opts {:dimensions 3})]
    (plot-t-sne-coords
     rankings-ds (:ranking-label rankings-ds)
     t-sne-opts
     {:=color :label :=coordinates :3d})))
;; 3D similarity-rank plot for a short free-text query, using default options.
(vis-similarity-rankings-3d "Climate change" {})
source: src/notebooks/vector_database.clj