Compare commits

...

5 Commits

Author SHA1 Message Date
Matt Williams
9dd88dc040 Fixes for Bruces comments
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-11-06 14:16:24 -08:00
Matt Williams
3d8872bbbd update as per jmorganca comments
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-11-06 08:51:15 -08:00
Matt Williams
a1c8974975 also try other models
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-11-05 16:13:48 -08:00
Matt Williams
1aaaaa76a0 add examples
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-11-05 15:56:00 -08:00
Matt Williams
9411399cb4 Add new example for self querying retrieval
Signed-off-by: Matt Williams <m@technovangelist.com>
2023-11-05 15:53:24 -08:00
8 changed files with 2193 additions and 0 deletions

View File

@ -0,0 +1,2 @@
node_modules
artcollection

View File

@ -0,0 +1,73 @@
import { Chroma } from "langchain/vectorstores/chroma";
import { ChromaTranslator } from "langchain/retrievers/self_query/chroma";
import { Ollama } from "langchain/llms/ollama"
import { AttributeInfo } from "langchain/schema/query_constructor";
import { HuggingFaceTransformersEmbeddings } from "langchain/embeddings/hf_transformers";
import { SelfQueryRetriever } from "langchain/retrievers/self_query";
// Name of the Ollama model that will turn the question into a structured query.
const modelName = "codellama";
// Define the attributes of the schema so that the model will know what to look for
// (these must match the metadata fields written by GenerateSource).
const attributeInfo: AttributeInfo[] = [
  {
    name: "title",
    type: "string",
    description: "The title of the painting"
  },
  {
    name: "date",
    type: "integer",
    description: "The four digit year when the painting was created"
  },
  {
    name: "artistName",
    type: "string",
    description: "The first name and last name of the artist who created the painting. Always use the full name in the filter, even if it isn't included. If the query is 'van Gogh', the filter should be 'Vincent van Gogh'. Use Pierre-Auguste Renoir instead of just Renoir."
  }
]
// Define the model used to generate embeddings, these capture the context of the input data.
// NOTE(review): must be the same embedding model the collection was built with — confirm against GenerateSource.
const embeddings = new HuggingFaceTransformersEmbeddings({
  modelName: "Xenova/all-MiniLM-L6-v2",
});
// Run the model using Ollama
const llm = new Ollama({
  model: modelName
})
// One-line description of what each document contains; the retriever passes this to the LLM.
const documentContents = "Description of the art";
// Query the vector store with a self-query retriever and print the matching documents.
const findArt = async () => {
  // Load the saved vector store
  const vectorStore = await Chroma.fromExistingCollection(embeddings, {
    collectionName: "artcollection",
  });
  // The retriever needs the LLM, the store, a description of the documents, the
  // metadata schema, and a translator that turns the LLM's structured query
  // into a filter ChromaDB understands.
  const retriever = SelfQueryRetriever.fromLLM({
    llm,
    vectorStore,
    documentContents,
    attributeInfo,
    verbose: false,
    useOriginalQuery: true,
    structuredQueryTranslator: new ChromaTranslator(),
  });
  // Get the query from the command line; bail out with usage help if it is missing
  // rather than sending `undefined` to the retriever.
  const query = process.argv[2];
  if (!query) {
    console.log('Usage: tsx ./FindArt.ts "<your question about the art>"');
    return;
  }
  try {
    const newquery = await retriever.getRelevantDocuments(query, [
      // You can add callbacks to the retriever to get information about the process.
      // In this case, show the output query from the LLM used to retrieve the documents.
      {
        handleLLMEnd(output) {
          console.log("This is the output from the LLM after it has come up with a filter")
          const llmEndOutput = output.generations[0][0].text.replace(/\\"/gm, "'").replace(/\n/gm, "")
          console.log(`output - ${JSON.stringify(llmEndOutput, null, 2)}`)
        }
      },
    ]);
    console.log(newquery);
  } catch (error) {
    console.log(`There was an error getting the values: ${error}`);
  }
}
findArt();

View File

@ -0,0 +1,128 @@
import { Artwork, RawArtwork } from './types';
import { HuggingFaceTransformersEmbeddings } from 'langchain/embeddings/hf_transformers';
import { Chroma } from "langchain/vectorstores/chroma";
import { Document } from "langchain/document";
import { ChromaClient } from "chromadb";
// Maximum number of works to fetch per artist from the search endpoint.
const numberOfArtworks = 10;
// list of artists we are going to pull from the API
const artists = ["van Gogh", "Renoir", "Monet", "Picasso"]
// Rebuild the "artcollection" vector store from scratch using the art API.
const generateSource = async () => {
  // Delete the existing vector store so that we don't get duplicate documents.
  // On a fresh database the collection doesn't exist yet and the delete rejects;
  // that's fine — there is simply nothing to clean up.
  try {
    await new ChromaClient().deleteCollection({
      name: "artcollection",
    });
  } catch {
    // collection not present — ignore and continue
  }
  const allartworkdocs = await getArt(artists);
  // Create the vector store
  const vectorStore = await Chroma.fromDocuments(allartworkdocs, embedding, { collectionName: "artcollection" });
  console.log(`Created vector store with ${await vectorStore.collection?.count()} documents`);
}
// Collect Documents for every work by each of the given artists.
const getArt = async (artists: string[]) => {
  const artistsWorkIds: number[] = []
  for (const artist of artists) {
    // First get the ids of the works by each artist
    const thisIds = await fetchArtistWorkIds(artist);
    console.log(`Fetching ${artist}`);
    // Pause between artists so we don't hammer the public API.
    await (new Promise(r => setTimeout(r, 1000)));
    artistsWorkIds.push(...thisIds);
  }
  // now get the actual artwork
  const artwork = await fetchArtwork(artistsWorkIds);
  return artwork
}
// Look up the ids of up to `numberOfArtworks` works matching an artist name
// via the Art Institute of Chicago search endpoint.
const fetchArtistWorkIds = async (artist: string): Promise<number[]> => {
  // Encode the artist name — names like "van Gogh" contain spaces.
  const artistURL = `https://api.artic.edu/api/v1/artworks/search?q=${encodeURIComponent(artist)}&limit=${numberOfArtworks}`;
  const response = await fetch(artistURL);
  if (!response.ok) {
    throw new Error(`Artwork search failed for "${artist}": HTTP ${response.status}`);
  }
  const json = await response.json();
  // Guard against a missing data array so .map can't blow up on undefined.
  const artistWorks: { id: number }[] = json.data ?? [];
  return artistWorks.map((work) => work.id);
}
// Embedding model used to vectorize each document before it is stored.
// NOTE(review): queries must use this same model so vectors are comparable — verify against FindArt.
const embedding = new HuggingFaceTransformersEmbeddings({
  modelName: "Xenova/all-MiniLM-L6-v2",
});
// Turns out there are some weird characters in the descriptions, so scrub them out.
// Returns a single space (" ") as the sentinel for a null input; callers treat
// that value as "no usable description" and skip the record.
const sanitize = (badstring: string): string => {
  if (badstring === null) {
    return " ";
  }
  // Ordered [pattern, replacement] pairs: strip HTML markup and map troublesome
  // Unicode punctuation, symbols, ligatures, and accented letters to plain ASCII.
  const substitutions: [RegExp, string][] = [
    [/<\s*a\s+[^>]*href\s*=\s*["']?([^"' >]+)["' >]>/gm, ""], // opening anchor tags
    [/<\/a>/gm, ""],
    [/<\/?em>/gm, ""],
    [/[\u2018\u2019]/gm, ""],   // curly single quotes
    [/[\u201C\u201D]/gm, ""],   // curly double quotes
    [/[\u2013\u2014]/gm, "-"],  // en/em dashes
    [/[\u2026]/gm, "..."],      // ellipsis
    [/[\u00A0]/gm, " "],        // non-breaking space
    [/[\u00AD]/gm, "-"],        // soft hyphen
    [/[\u00B0]/gm, " degrees "],
    [/[\u00B1]/gm, " plus or minus "],
    [/[\u00B2]/gm, " squared "],
    [/[\u00B3]/gm, " cubed "],
    [/[\u00B4]/gm, "'"],
    [/[\u00B5]/gm, " micro "],
    [/[\u00B6]/gm, " paragraph "],
    [/[\u00B7]/gm, " dot "],
    [/[\u00B8]/gm, ","],
    [/[\u00B9]/gm, " first "],
    [/[\u00BA]/gm, " degrees "],
    [/[\u00BB]/gm, ">>"],
    [/[\u00BC]/gm, " 1/4 "],
    [/[\u00BD]/gm, " 1/2 "],
    [/[\uFB01]/gm, "fi"],       // typographic ligatures
    [/[\uFB02]/gm, "fl"],
    [/[\uFB03]/gm, "ffi"],
    [/[\uFB04]/gm, "ffl"],
    [/[\uFB05]/gm, "ft"],
    [/[\uFB06\uFB07\uFB08]/gm, "st"],
    [/[\u00D7]/gm, "x"],        // multiplication sign
    [/[\u00E8\u00E9]/gm, "e"],  // accented letters
    [/[\u00F1]/gm, "n"],
    [/[\u00F6]/gm, "o"],
    [/[\u00F8]/gm, "o"],
    [/[\u00FC]/gm, "u"],
    [/[\u00FF]/gm, "y"],
    [/[\u0101\u0103\u00E0]/gm, "a"],
    [/[\u00C9]/gm, "E"],
    [/<p>/gm, ""],              // paragraph tags and newlines
    [/<\/p>/gm, ""],
    [/\n/gm, ""],
  ];
  return substitutions.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    badstring
  );
}
// Fetch each artwork by id and turn the ones with a usable description into Documents.
const fetchArtwork = async (workids: number[]) => {
  const docsarray = [];
  // workids is a plain array, so a regular for...of is correct (the original
  // for await was unnecessary — the elements are numbers, not promises).
  for (const workid of workids) {
    const artworkURL = `https://api.artic.edu/api/v1/artworks/${workid}`;
    const response = await fetch(artworkURL);
    const json = await response.json();
    const artworkraw = json.data as RawArtwork;
    const description = sanitize(artworkraw.description)
    // sanitize() returns a single space when the description was null;
    // skip those records since there is nothing to embed.
    if (description !== " ") {
      const doc = new Document({
        pageContent: description,
        metadata: {
          title: sanitize(artworkraw.title),
          date: artworkraw.date_end,
          artistName: artworkraw.artist_title,
        }
      });
      docsarray.push(doc);
      console.log("------------------")
      console.log(`${artworkraw.title} - ${artworkraw.artist_title}`);
    }
  }
  return docsarray;
}
generateSource();

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
{
"name": "typescript-selfqueryingretreival",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"@xenova/transformers": "^2.7.0",
"chromadb": "^1.5.11",
"langchain": "^0.0.177",
"ollama-node": "^0.1.24",
"peggy": "^3.0.2",
"sharp": "^0.32.6"
}
}

View File

@ -0,0 +1,111 @@
# Self Query Retrieval
Filtering your vector database results to get better answers from your LLM.
![sqr 2023-11-05 14_30_50](https://github.com/jmorganca/ollama/assets/633681/55afb7f5-ebd8-4c58-86ba-284594fd1ec8)
## TLDR
1. Install and run ChromaDB
1. Run `git clone https://github.com/chroma-core/chroma.git`
2. `cd chroma`
3. `docker-compose up -d --build`
2. Navigate to this example's directory
3. `npm install`
4. `tsx ./GenerateSource.ts`
5. `tsx ./FindArt.ts "are there any paintings from the artist Pablo Picasso"`
Other questions to try:
- Are there any paintings painted in 1881
- Are there any paintings painted by Vincent van Gogh
Note: If you haven't used `tsx`, it's a more modern alternative to `ts-node` and works especially well when you have libraries that use different module types. You can find it at [https://github.com/esbuild-kit/tsx](https://github.com/esbuild-kit/tsx).
## Introduction
Retrieval Augmented Generation (RAG) is what developers usually reach for when they want to ask questions to all of their notes. But often it doesn't give the results you need. And that's because there is still too much information. And frequently it's the wrong information. When you ask a question, RAG will retrieve a set of documents that it thinks are relevant to the question and then hand them off to the LLM. If you ask "what is a transformer", it may grab excerpts from the Transformers paper you read recently, along with sections of your Intro to Electronics book. Even if you ask a better question, such as "what is a transformer in the context of electrical engineering", it may still grab excerpts from the Transformers paper. And that's because the Transformers paper is a very good match for the question. It's just not the right match.
Ideally, the Transformers paper and the Electronics book would be added to the database with some metadata, such as the topics or keywords. But RAG typically doesn't look at those metadata fields. And that's where Self Query Retrieval comes in. It's a way to use traditional database queries to narrow down the set of documents that RAG will use and thus get better results.
## How it works
There are a few things you need to do to enable Self Query Retrieval. First, there needs to be additional metadata about your content in the database. The examples in the Langchain documentation are based on movies, and the metadata includes the year, the director's name, the genre, etc. And then you need to pass the schema to the query to help it get the right documents.
## The code
There are two main parts to the code. First there is a `GenerateSource.ts` file and then there is a `FindArt.ts` file. Let's look at GenerateSource first.
### GenerateSource
The purpose of Generate Source is to create our data source. For this example, we are using the [Art Institute of Chicago API,](https://api.artic.edu/docs/#introduction) which is incredible. This will be loaded into a vector database, which for this example is ChromaDB.
This could be any CSV file or other data source you have access to. The file would have a single descriptive column and then metadata columns. All the relevant columns from our dataset are being added to a Document object. Then that array of Documents is being loaded into ChromaDB. Finally, at the end, I verify that documents were created by outputting a count to the screen.
```typescript
await new ChromaClient().deleteCollection({
name: "artcollection",
});
const vectorStore = await Chroma.fromDocuments(allartworkdocs,
embedding, { collectionName: "artcollection" });
console.log(`Created vector store with
${await vectorStore.collection?.count()} documents`);
```
### FindArt
To actually find the art, we need to start by loading the database:
```typescript
const vectorStore = await Chroma.fromExistingCollection(embeddings, {
collectionName: "artcollection",
});
```
Now we can create our Self Query Retriever. This needs to be created referring to the LLM, the database, the description of the document and the description of all the attributes in the metadata, and finally a structured query translator which will take the query generated by the LLM and turn it into something usable by the database.
```typescript
const llm = new Ollama({
model: modelName
})
const documentContents = "Description of the art";
const attributeInfo: AttributeInfo[] = [
{
name: "title",
type: "string",
description: "The title of the painting"
},
{
name: "date",
type: "integer",
description: "The four digit year when the painting was created"
},
{
name: "artistName",
type: "string",
description: "The first name and last name of the artist who created the painting. Always use the full name in the filter, even if it isn't included. If the query is 'van Gogh', the filter should be 'Vincent van Gogh'. Use Pierre-Auguste Renoir instead of just Renoir."
}
]
const retriever = SelfQueryRetriever.fromLLM({
llm, vectorStore, documentContents, attributeInfo, verbose: false, useOriginalQuery: true, structuredQueryTranslator: new ChromaTranslator()
});
```
Now we can ask a question and get the results:
```typescript
const newquery = await retriever.getRelevantDocuments(query)
```
## Next Steps
When you run this example, you will get a set of documents from the database that may be a bit more relevant to your question. Now you could feed those to the LLM and get the actual answer to the question based on these documents.
To take this further, you could work on getting more out of the dataset. It turns out that this works best if there is only a single possible value for any given field. Our artists are often referred to by their last name, but sometimes using their full name. It may be Vincent van Gogh, or just van Gogh. Another way to get around this is to build a better query translator that knows that the search could be for a substring of the full name. But that also requires looking into the metadata searching capabilities of the database.
Maybe it makes more sense to move the artist name and title of the work into the document itself. Then add some more metadata (there are at least 100 other attributes in the raw API that aren't used in this example.)
Also try different models. In testing so far, it seems that `codellama` produces more reliably usable filters. It's not perfect and can still create a filter that won't find anything. When a new code model comes out, you might try that to see if it performs better.

View File

@ -0,0 +1,10 @@
{
"compilerOptions": {
"target": "es2016",
"module": "commonjs", /* Specify what module code is generated. */
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
"strict": true, /* Enable all strict type-checking options. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
}
}

View File

@ -0,0 +1,26 @@
// Shape of one artwork record as returned by the Art Institute of Chicago API
// (only the fields this example touches; the API returns many more).
export type RawArtwork = {
  id: number;
  title: string;
  artist_display: string;
  place_of_origin: string;
  date_start: number;
  // Used as the `date` metadata field when building documents.
  date_end: number;
  duration: number;
  dimensions: string;
  medium_display: string;
  credit_line: string;
  artwork_type_title: string;
  department_title: string;
  // Used as the `artistName` metadata field.
  artist_title: string;
  classification_title: string;
  // May contain HTML markup and typographic characters; run through sanitize()
  // before use. NOTE(review): can apparently be null at runtime despite the type.
  description: string;
}
// Simplified artwork shape. NOTE(review): imported by GenerateSource but not
// currently populated there — presumably reserved for later use; verify before removing.
export type Artwork = {
  id: number;
  title: string;
  country: string;
  date: number;
  artist: string;
  description: string;
}