Add new example for self querying retrieval

Signed-off-by: Matt Williams <m@technovangelist.com>
This commit is contained in:
Matt Williams 2023-11-05 15:53:24 -08:00
parent ad88799411
commit 9411399cb4
7 changed files with 2187 additions and 0 deletions

View File

@ -0,0 +1,2 @@
node_modules
artcollection

View File

@ -0,0 +1,72 @@
import { Chroma } from "langchain/vectorstores/chroma";
import { ChromaTranslator } from "langchain/retrievers/self_query/chroma";
import { Ollama } from "langchain/llms/ollama"
import { AttributeInfo } from "langchain/schema/query_constructor";
import { HuggingFaceTransformersEmbeddings } from "langchain/embeddings/hf_transformers";
import { SelfQueryRetriever } from "langchain/retrievers/self_query";
// Name of the model that the Ollama client below is asked to run
const modelName = "codellama";

// Metadata schema given to the self-query retriever. Each entry names a
// metadata field, states its type, and carries the description the model
// reads when deciding how to filter on that field.
const attributeInfo: AttributeInfo[] = [
  {
    name: "title",
    type: "string",
    description: "The title of the painting",
  },
  {
    name: "date",
    type: "integer",
    description: "The four digit year when the painting was created",
  },
  {
    name: "artistName",
    type: "string",
    description: "The first name and last name of the artist who created the painting. Always use the full name in the filter, even if it isn't included. If the query is 'van Gogh', the filter should be 'Vincent van Gogh'. Use Pierre-Auguste Renoir instead of just Renoir.",
  },
];

// Embedding model handed to Chroma when the saved collection is opened
const embeddings = new HuggingFaceTransformersEmbeddings({
  modelName: "Xenova/all-MiniLM-L6-v2",
});

// Ollama-backed LLM used by the retriever
const llm = new Ollama({ model: modelName });

// One-line summary of what each stored document contains
const documentContents = "Description of the art";
/**
 * Runs a self-querying retrieval against the saved "artcollection" Chroma
 * collection and prints the documents that were found.
 *
 * The question is taken from the first command-line argument. Every failure,
 * including a failure to open the collection, is reported on the console
 * rather than escaping as an unhandled promise rejection.
 */
const findArt = async () => {
  // Get the query from the command line
  const query = process.argv[2];
  if (query === undefined || query.trim() === "") {
    console.log("Please pass the question to ask as the first argument");
    return;
  }
  try {
    // Load the saved vector store (inside the try so connection problems are reported too)
    const vectorStore = await Chroma.fromExistingCollection(embeddings, {
      collectionName: "artcollection",
    });
    const retriever = SelfQueryRetriever.fromLLM({
      llm,
      vectorStore,
      documentContents,
      attributeInfo,
      verbose: false,
      useOriginalQuery: true,
      structuredQueryTranslator: new ChromaTranslator(),
    });
    const documents = await retriever.getRelevantDocuments(query, [
      // You can add callbacks to the retriever to get information about the process. In this case, show the output
      // query from the LLM used to retrieve the documents
      {
        handleLLMEnd(output) {
          console.log("llm end");
          // Flatten the generated text onto one line: escaped double quotes become single quotes, newlines are dropped
          const generated = output.generations[0][0].text.replace(/\\"/gm, "'").replace(/\n/gm, "");
          console.log(`output - ${JSON.stringify(generated, null, 2)}`);
        },
      },
    ]);
    console.log(documents);
  } catch (error) {
    console.log(`There was an error getting the values: ${error}`);
  }
};
findArt();

View File

@ -0,0 +1,134 @@
import { Artwork, RawArtwork } from './types';
import { HuggingFaceTransformersEmbeddings } from 'langchain/embeddings/hf_transformers';
import { Chroma } from "langchain/vectorstores/chroma";
import { Document } from "langchain/document";
import { ChromaClient } from "chromadb";
// Maximum number of search results requested for each artist
const numberOfArtworks = 15;
// Artists whose works are pulled from the API
const artists = [
  "van Gogh",
  "Renoir",
  "Monet",
  "Picasso",
];
/**
 * Rebuilds the "artcollection" vector store from scratch: drops any existing
 * collection, downloads the artwork documents for the configured artists,
 * embeds them into a new collection and prints how many documents it holds.
 */
const generateSource = async () => {
  // Delete the existing vector store so that we don't get duplicate documents.
  // On a first run there is nothing to delete yet; depending on the client
  // version that may reject, so the failure is logged and the rebuild goes on.
  try {
    await new ChromaClient().deleteCollection({
      name: "artcollection",
    });
  } catch (error) {
    console.log(`Could not delete the existing collection, continuing: ${error}`);
  }
  const allartworkdocs = await getArt(artists);
  // Create the vector store
  const vectorStore = await Chroma.fromDocuments(allartworkdocs, embedding, { collectionName: "artcollection" });
  console.log(`Created vector store with ${await vectorStore.collection?.count()} documents`);
};
/**
 * Collects the artwork ids for every artist and then downloads the artworks.
 *
 * @param artists Search terms, one per artist.
 * @returns The documents produced by fetchArtwork for the collected ids.
 */
const getArt = async (artists: string[]) => {
  const artistsWorkIds: number[] = [];
  for (const artist of artists) {
    // First get the ids of the works by each artist
    console.log(`Fetching ${artist}`);
    const thisIds = await fetchArtistWorkIds(artist);
    // Wait a second between search requests
    await new Promise((resolve) => setTimeout(resolve, 1000));
    artistsWorkIds.push(...thisIds);
  }
  // The same work can come back from more than one search; fetch each id once
  const uniqueWorkIds = Array.from(new Set(artistsWorkIds));
  // now get the actual artwork
  return fetchArtwork(uniqueWorkIds);
};
/**
 * Searches the artworks API for an artist and returns the ids of the hits.
 *
 * @param artist Free-text search term; it is URL-encoded before being sent.
 * @returns The ids found, or an empty array when the request fails or the
 *          response carries no `data` list.
 */
const fetchArtistWorkIds = async (artist: string): Promise<number[]> => {
  // Encode the term so spaces, '&' and similar characters cannot corrupt the query string
  const artistURL = `https://api.artic.edu/api/v1/artworks/search?q=${encodeURIComponent(artist)}&limit=${numberOfArtworks}`;
  const response = await fetch(artistURL);
  if (!response.ok) {
    console.log(`Search for ${artist} failed with status ${response.status}`);
    return [];
  }
  const json = await response.json();
  const artistWorks: { id: number }[] = json.data ?? [];
  return artistWorks.map((work) => work.id);
};
// Embedding model passed to Chroma when the documents are written to the vector store
const embedding = new HuggingFaceTransformersEmbeddings({ modelName: "Xenova/all-MiniLM-L6-v2" });
// Turns out there are some weird characters in the descriptions.
// Ordered [pattern, replacement] pairs; sanitize applies them one after another.
const sanitizeRules: ReadonlyArray<readonly [RegExp, string]> = [
  // HTML markup embedded in the text
  [/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]>/gm, ""],
  [/<\/a>/gm, ""],
  [/<\/?em>/gm, ""],
  // Typographic quotes, dashes, ellipsis and special spaces/hyphens
  [/[\u2018\u2019]/gm, ""],
  [/[\u201C\u201D]/gm, ""],
  [/[\u2013\u2014]/gm, "-"],
  [/[\u2026]/gm, "..."],
  [/[\u00A0]/gm, " "],
  [/[\u00AD]/gm, "-"],
  // Latin-1 symbols replaced by plain-text stand-ins
  [/[\u00B0]/gm, " degrees "],
  [/[\u00B1]/gm, " plus or minus "],
  [/[\u00B2]/gm, " squared "],
  [/[\u00B3]/gm, " cubed "],
  [/[\u00B4]/gm, "'"],
  [/[\u00B5]/gm, " micro "],
  [/[\u00B6]/gm, " paragraph "],
  [/[\u00B7]/gm, " dot "],
  [/[\u00B8]/gm, ","],
  [/[\u00B9]/gm, " first "],
  [/[\u00BA]/gm, " degrees "],
  [/[\u00BB]/gm, ">>"],
  [/[\u00BC]/gm, " 1/4 "],
  [/[\u00BD]/gm, " 1/2 "],
  // Ligatures expanded to their letters
  [/[\uFB01]/gm, "fi"],
  [/[\uFB02]/gm, "fl"],
  [/[\uFB03]/gm, "ffi"],
  [/[\uFB04]/gm, "ffl"],
  [/[\uFB05]/gm, "ft"],
  [/[\uFB06\uFB07\uFB08]/gm, "st"],
  // Multiplication sign and accented letters reduced to ASCII
  [/[\u00D7]/gm, "x"],
  [/[\u00E8\u00E9]/gm, "e"],
  [/[\u00F1]/gm, "n"],
  [/[\u00F6]/gm, "o"],
  [/[\u00F8]/gm, "o"],
  [/[\u00FC]/gm, "u"],
  [/[\u00FF]/gm, "y"],
  [/[\u0101\u0103\u00E0]/gm, "a"],
  [/[\u00C9]/gm, "E"],
  // Paragraph tags and line breaks
  [/<p>/gm, ""],
  [/<\/p>/gm, ""],
  [/\n/gm, ""],
];

/**
 * Strips the handled HTML tags and replaces non-ASCII punctuation, symbols,
 * ligatures and accented letters with plain-text equivalents.
 *
 * @param badstring Raw text; may be null or undefined when the field is missing.
 * @returns The cleaned text, or a single space when no text was supplied.
 */
const sanitize = (badstring: string | null | undefined): string => {
  // `== null` covers both null and undefined, so an absent field cannot throw on .replace
  if (badstring == null) {
    return " ";
  }
  let goodstring: string = badstring;
  for (const [pattern, replacement] of sanitizeRules) {
    goodstring = goodstring.replace(pattern, replacement);
  }
  return goodstring;
};
/**
 * Downloads each artwork by id and turns those that have a description into
 * documents for the vector store.
 *
 * The sanitized description becomes the page content; title, end date and
 * artist name are stored as metadata. Works whose request fails, whose
 * response has no `data`, or whose description is empty are skipped.
 *
 * @param workids Ids of the artworks to download.
 * @returns One document per artwork that has a usable description.
 */
const fetchArtwork = async (workids: number[]) => {
  const docsarray: Document[] = [];
  for (const workid of workids) {
    const artworkURL = `https://api.artic.edu/api/v1/artworks/${workid}`;
    const response = await fetch(artworkURL);
    if (!response.ok) {
      console.log(`Skipping artwork ${workid}: request failed with status ${response.status}`);
      continue;
    }
    const json = await response.json();
    const artworkraw: RawArtwork | undefined = json.data;
    if (artworkraw == null) {
      console.log(`Skipping artwork ${workid}: response contained no data`);
      continue;
    }
    const description = sanitize(artworkraw.description);
    // sanitize returns " " for a missing description; blank text is not worth indexing either
    if (description.trim() !== "") {
      const doc = new Document({
        pageContent: description,
        metadata: {
          title: sanitize(artworkraw.title),
          date: artworkraw.date_end,
          artistName: artworkraw.artist_title,
        },
      });
      docsarray.push(doc);
      console.log("------------------");
      console.log(`${artworkraw.title} - ${artworkraw.artist_title}`);
    }
  }
  return docsarray;
};
// Start the ingestion; report a failure instead of leaving an unhandled promise rejection
generateSource().catch((error: unknown) => {
  const reason = error instanceof Error ? error.message : String(error);
  console.error(`Failed to build the art collection: ${reason}`);
  process.exitCode = 1;
});

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
{
"name": "typescript-selfqueryingretreival",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"@xenova/transformers": "^2.7.0",
"chromadb": "^1.5.11",
"langchain": "^0.0.177",
"ollama-node": "^0.1.24",
"peggy": "^3.0.2",
"sharp": "^0.32.6"
}
}

View File

@ -0,0 +1,109 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig to read more about this file */
/* Projects */
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
/* Language and Environment */
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
/* Modules */
"module": "commonjs", /* Specify what module code is generated. */
// "rootDir": "./", /* Specify the root folder within your source files. */
// "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
/* JavaScript Support */
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
/* Emit */
// "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
// "outDir": "./", /* Specify an output folder for all emitted files. */
// "removeComments": true, /* Disable emitting comments. */
// "noEmit": true, /* Disable emitting files from a compilation. */
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
// "importsNotUsedAsValues": "remove", /* Specify emit/checking behavior for imports that are only used for types. */
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
// "newLine": "crlf", /* Set the newline character for emitting files. */
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
// "preserveValueImports": true, /* Preserve unused imported values in the JavaScript output that would otherwise be removed. */
/* Interop Constraints */
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
/* Type Checking */
"strict": true, /* Enable all strict type-checking options. */
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
/* Completeness */
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
}
}

View File

@ -0,0 +1,27 @@
/**
 * Shape that the ingestion script assigns to the `data` object of a
 * single-artwork API response.
 *
 * The script reads only `description`, `title`, `date_end` and
 * `artist_title`; the other fields are declared but not used there.
 * NOTE(review): the script null-checks `description` before cleaning it, so
 * some of these values may be null at runtime despite the declared types.
 */
export type RawArtwork = {
  id: number;
  title: string;
  artist_display: string;
  place_of_origin: string;
  date_start: number;
  // Stored as the `date` metadata of each document
  date_end: number;
  duration: number;
  dimensions: string;
  medium_display: string;
  credit_line: string;
  artwork_type_title: string;
  department_title: string;
  // Stored as the `artistName` metadata of each document
  artist_title: string;
  classification_title: string;
  // Becomes the page content of each document after cleaning
  description: string;
};
/**
 * Condensed artwork record.
 * NOTE(review): the documents written to the vector store are built directly
 * from RawArtwork, not from this type; confirm whether it is still needed.
 */
export type Artwork = {
  id: number;
  title: string;
  country: string;
  date: number;
  artist: string;
  description: string;
  // description_embedding: number[];
};