Here two vector databases are created:

One using the fine-tuned BERT with CLS pooling

One using the pretrained BERT with CLS pooling

In [30]:
# Imports

from lxml import etree

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

from sentence_transformers import SentenceTransformer, models
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Annoy, FAISS
from typing import List
from langchain.vectorstores.base import VectorStore


import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from langchain.llms import GPT4All
from langchain.chains import RetrievalQA

# Namespace prefix -> URI map for lxml. It is passed as `namespaces=` to the
# find()/findall() calls below so that paths can use the short `uslm:` prefix
# instead of the full namespace URI.
NS = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0',
      'xhtml': 'http://www.w3.org/1999/xhtml'}

Parse tax law

In [ ]:
def get_ancestor_heading_text(section, tag, ns):
    """
    Return the heading text of the nearest ancestor element with a given tag.

    Walks up from ``section`` via ``getparent()`` until it finds an ancestor
    whose qualified tag is ``{ns['uslm']}tag`` and returns the stripped text of
    that ancestor's ``uslm:heading`` child.

    Args:
        section: Element to start from (its own tag is not considered).
        tag (str): Local tag name of the ancestor to look for (e.g. "chapter").
        ns (dict): Namespace prefix map; must contain the key ``'uslm'``.

    Returns:
        str: The stripped heading text, or "" when there is no matching
        ancestor, the ancestor has no heading child, or the heading has no
        leading text.
    """
    # Fully-qualified tag in the "{namespace-uri}local-name" form used by lxml.
    qualified_tag = f"{{{ns['uslm']}}}{tag}"

    ancestor = section.getparent()
    while ancestor is not None:
        if ancestor.tag == qualified_tag:
            heading = ancestor.find('uslm:heading', namespaces=ns)
            if heading is None:
                return ""
            # `.text` is None for an empty heading or one that starts with a
            # child element; treat that as "no heading" instead of crashing.
            return (heading.text or "").strip()
        ancestor = ancestor.getparent()
    return ""

def parse_sections_with_metadata(file_path):
    """
    Parse an XML file into one record per ``uslm:section`` that has text.

    Args:
        file_path: Path of the XML file to read (opened in binary mode).

    Returns:
        list[dict]: One dict per section with at least one non-empty
        ``uslm:p`` descendant, shaped as
        ``{"metadata": {"subtitle", "chapter", "part"}, "content": str}``.
        ``content`` is the section's paragraph texts joined by newlines; each
        metadata value is the heading of the nearest matching ancestor, or "".
    """
    with open(file_path, 'rb') as xml_file:
        document_tree = etree.parse(xml_file)

    records = []
    for section in document_tree.findall('.//uslm:section', namespaces=NS):
        # Flatten every paragraph (including nested inline elements) to text.
        flattened = (
            ' '.join(paragraph.itertext()).strip()
            for paragraph in section.findall('.//uslm:p', namespaces=NS)
        )
        paragraphs = [text for text in flattened if text]

        # Sections without any paragraph text are skipped entirely.
        if not paragraphs:
            continue

        # Headings of the enclosing subtitle / chapter / part, in that order.
        metadata = {
            level: get_ancestor_heading_text(section, level, NS)
            for level in ("subtitle", "chapter", "part")
        }

        records.append({
            "metadata": metadata,
            "content": "\n".join(paragraphs),
        })

    return records

# Parse the XML file into section records. Despite the name, `data_dict` is a
# list of {"metadata": ..., "content": ...} dicts, one per section with text.
data_dict = parse_sections_with_metadata("/DL-data/usc26.xml")
# Bare expression: shows the parsed records as the notebook cell's output.
data_dict

Ingest & chunk data

In [4]:
# Wrap each parsed section in a LangChain Document: the section text becomes
# `page_content` and the subtitle/chapter/part headings become `metadata`.
documents = [
    Document(page_content=d["content"], metadata=d["metadata"])
    for d in data_dict
]

# Split each document into smaller, overlapping chunks for embedding.
# NOTE(review): chunk_size/chunk_overlap are presumably measured in characters
# (the splitter's default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
chunked_docs = text_splitter.split_documents(documents)

Embed and Store

In [5]:
# Init the embedding model -- fine tuned
# NOTE(review): the model name looks like a Hugging Face Hub repo id — confirm
# it is resolvable in the environment this notebook runs in.
ft_embedding_model = HuggingFaceEmbeddings(model_name="chandlerNick/sentence-transformers-usc26-bert")

# BERT BASE: the pretrained (not fine-tuned) baseline encoder.
bert = models.Transformer('bert-base-uncased')

# Pooling layer that uses only the CLS token as the sentence embedding;
# mean and max pooling over tokens are explicitly disabled.
pooling = models.Pooling(
    word_embedding_dimension=bert.get_word_embedding_dimension(),
    pooling_mode_cls_token=True,
    pooling_mode_mean_tokens=False,
    pooling_mode_max_tokens=False,
)

# Assemble encoder + pooling into a SentenceTransformer and save it to disk so
# it can be loaded below by path through the same HuggingFaceEmbeddings wrapper
# used for the fine-tuned model.
cls_model = SentenceTransformer(modules=[bert, pooling])
cls_model.save("custom-bert-cls")

# Pretrained ("pt") embedding model, loaded from the directory saved above.
pt_embedding_model = HuggingFaceEmbeddings(model_name="./custom-bert-cls")

FT Vector Store

In [ ]:
# Create an Annoy vector store from the chunked docs, embedded with the
# fine-tuned model, and save it locally as "ft_annoy_tax_code_index".
ft_vector_store = Annoy.from_documents(chunked_docs, ft_embedding_model)
ft_vector_store.save_local("ft_annoy_tax_code_index")
In [11]:
# Create a FAISS vector store from the same chunks and the same fine-tuned
# embedding model, and save it locally as "faiss_ft_store".
ft_vector_store_2 = FAISS.from_documents(chunked_docs, ft_embedding_model)
ft_vector_store_2.save_local("faiss_ft_store")

PT Vector Store

In [ ]:
# Create an Annoy vector store from the chunked docs, embedded with the
# pretrained CLS-pooled model, and save it locally as "pt_annoy_tax_code_index".
pt_vector_store = Annoy.from_documents(chunked_docs, pt_embedding_model)
pt_vector_store.save_local("pt_annoy_tax_code_index")
In [12]:
# Create a FAISS vector store from the same chunks and the same pretrained
# CLS-pooled embedding model, and save it locally as "faiss_pt_store".
pt_vector_store_2 = FAISS.from_documents(chunked_docs, pt_embedding_model)
pt_vector_store_2.save_local("faiss_pt_store")

Qualitative Comparison of Vector Stores

In [31]:
def inspect_query_results(
    query: str,
    vector_stores: List[VectorStore],
    labels: List[str],
    k: int = 5,
    metadata_key: str = "subtitle",
    print_content_preview_chars: int = 200,
):
    """
    Print the top-k results of a query across multiple vector stores.

    For each (store, label) pair a labelled section is printed that lists the
    retrieved documents as ``rank. [metadata value] content preview...``.

    Args:
        query (str): The input query string.
        vector_stores (List[VectorStore]): List of LangChain vector store objects.
        labels (List[str]): One label per vector store, in the same order
            (e.g. "FT BERT + Annoy"). Stores and labels are paired with
            ``zip``, so any surplus entries in the longer list are ignored.
        k (int): Number of top documents to retrieve per vector store.
        metadata_key (str): Key in metadata to show (e.g., "subtitle");
            "Unknown" is printed when a document lacks it.
        print_content_preview_chars (int): Number of characters from page
            content to preview; newlines in the preview are shown as spaces.

    Returns:
        None
    """
    # NOTE(review): the leading glyphs in the two header strings below look
    # mis-encoded in this copy of the file — confirm the intended emoji.
    print(f"\nπŸ” Query: \"{query}\"")
    print("-" * 80)

    for store, label in zip(vector_stores, labels):
        print(f"\nπŸ“˜ {label}")
        print("=" * len(label))
        try:
            results = store.similarity_search(query, k=k)
        except Exception:
            # Best-effort fallback for stores whose plain search fails: use the
            # scored variant and drop the scores. Only `Exception` is caught so
            # that KeyboardInterrupt / SystemExit still stop the run.
            scored_results = store.similarity_search_with_score(query, k=k)
            results = [scored[0] for scored in scored_results]

        for rank, doc in enumerate(results, start=1):
            metadata_value = doc.metadata.get(metadata_key, "Unknown")
            content_preview = doc.page_content[:print_content_preview_chars].replace("\n", " ")
            print(f"{rank}. [{metadata_value}] {content_preview}...")

        print("-" * 80)
In [34]:
# Stores to compare. The order must match `labels` below, because
# inspect_query_results pairs the two lists positionally.
vector_stores = [ft_vector_store, pt_vector_store, ft_vector_store_2, pt_vector_store_2]
# "FT BERT" = fine-tuned embedding model; "CLS BERT" = pretrained BERT with
# CLS pooling (the pt_* stores).
labels = [
    "FT BERT + Annoy",
    "CLS BERT + Annoy",
    "FT BERT + FAISS",
    "CLS BERT + FAISS"
]
# Probe queries, ranging from verbatim document text to an unrelated sentence.
queries = [
    "A tax is hereby imposed for each taxable year on the taxable income of every corporation",  # Directly copied text from document
    "For purposes of this part, an individual shall be treated as not married at the close of the taxable year if such individual is so treated under the provisions of section 7703(b)",  # Direct quoting
    "Tax on head of household",  # Near text from the documents
    "Tree huggers are cool!"  # A garbage query, unrelated to the tax law
]

# Print the top-5 hits of every store for each query, tagged by subtitle.
for query in queries:
    inspect_query_results(
        query=query,
        vector_stores=vector_stores,
        labels=labels,
        k=5,
        metadata_key="subtitle"
    )
πŸ” Query: "A tax is hereby imposed for each taxable year on the taxable income of every corporation"
--------------------------------------------------------------------------------

πŸ“˜ FT BERT + Annoy
===============
1. [Income Taxes] Subsec. (a)(5).  Pub. L. 92–603, Β§β€―135(a)(1)(B) , struck out par. (5) which provided that in the case of taxable years beginning after  Dec. 31, 2010 , the tax shall be equal to 7.0 percent of the amo...
2. [Income Taxes] In the case of a nonresident alien individual present in the United States for a period or periods aggregating 183 days or more during the taxable year, there is hereby imposed for such year a tax of ...
3. [Income Taxes] the United States as the amount upon which the existing 30 percent levy should be imposed, and added contingent income received from the sale of patents and other intangibles and amounts of original i...
4. [Income Taxes] β€œ(B)   Computation of tax .β€” If subparagraph (A) applies to any lump sum distribution of any taxpayer for any taxable year, the tax imposed by section 1 of the Internal Revenue Code of 1986 on such ta...
5. [Income Taxes] Subsec. (d)(3).  Pub. L. 105–206, Β§β€―6003(a)(1)(A) , (B), (2), redesignated par. (5) as (3), substituted β€œparagraph (1)” for β€œparagraph (3)” in introductory provisions, and struck out heading and text ...
--------------------------------------------------------------------------------

πŸ“˜ CLS BERT + Annoy
================
1. [Income Taxes] exchange of property shall not be considered a payment, and any payment due under such evidence of indebtedness”....
2. [Income Taxes] for β€œuseful life of any property shall be determined as of the time such property is placed in service by the taxpayer”....
3. [Procedure and Administration] All persons having liens upon or claiming any interest in the property involved in such action shall be made parties thereto....
4. [Procedure and Administration] any tax imposed by this title which is required to be paid by means of a stamp shall be filed by the taxpayer within 3 years from the time the tax was paid....
5. [Employment Taxes] to such tax. No deduction shall be allowed under this title for any liability imposed by the preceding sentence....
--------------------------------------------------------------------------------

πŸ“˜ FT BERT + FAISS
===============
1. [Income Taxes] A tax is hereby imposed for each taxable year on the taxable income of every corporation. The amount of the tax imposed by subsection (a) shall be 21 percent of taxable income. In the case of a foreig...
2. [Income Taxes] β€œ(B)   Taxation of exempt arbitrage profits.β€” β€œ(i)   In general .β€” In the case of an organization which elects the application of this subparagraph, there is hereby imposed a tax on the exempt arbitra...
3. [Income Taxes] β€œ(1)  Organizations taxable at corporate rates .β€”If an organization is subject to tax on unrelated business taxable income pursuant to subsection (a), the tax imposed by section 56 shall apply to such...
4. [Income Taxes] β€œ(2)  Organizations taxable as trusts .β€”If an organization is subject to tax on unrelated business taxable income pursuant to subsection (b), the taxes imposed by section 55 shall apply to such organi...
5. [Income Taxes] 1997β€”Subsec. (h).  Pub. L. 105–34  amended heading and text of subsec. (h) generally. Prior to amendment, text read as follows: β€œIf a taxpayer has a net capital gain for any taxable year, then the tax...
--------------------------------------------------------------------------------

πŸ“˜ CLS BERT + FAISS
================
1. [Income Taxes] for β€œuseful life of any property shall be determined as of the time such property is placed in service by the taxpayer”....
2. [Income Taxes] exchange of property shall not be considered a payment, and any payment due under such evidence of indebtedness”....
3. [Employment Taxes] to such tax. No deduction shall be allowed under this title for any liability imposed by the preceding sentence....
4. [Procedure and Administration] any tax imposed by this title which is required to be paid by means of a stamp shall be filed by the taxpayer within 3 years from the time the tax was paid....
5. [Procedure and Administration] All persons having liens upon or claiming any interest in the property involved in such action shall be made parties thereto....
--------------------------------------------------------------------------------

πŸ” Query: "For purposes of this part, an individual shall be treated as not married at the close of the taxable year if such individual is so treated under the provisions of section 7703(b)"
--------------------------------------------------------------------------------

πŸ“˜ FT BERT + Annoy
===============
1. [Income Taxes] β€œ(A) the taxpayer is allowed a deduction under section 151 with respect to such individual for the taxable year, β€œ(B) such individual has not attained the age of 17 as of the close of the calendar yea...
2. [Income Taxes] If the taxpayer is married at the close of the taxable year, the deduction shall be allowed under subsection (a) only if the taxpayer and the taxpayer’s spouse file a joint return for the taxable year...
3. [Income Taxes] 1978β€”Subsec. (f)(6).  Pub. L. 95–600  substituted provision disallowing a credit for any amount paid by a taxpayer to an individual with respect to whom, for the taxable year, a deduction under sectio...
4. [Income Taxes] the taxpayer to an individual bearing a relationship described in section 152(a)(1) through (8), or a dependent described in section 152(a)(9), except that a credit was allowed for an amount paid by a...
5. [Income Taxes] The term β€œqualifying child” means a qualifying child of the taxpayer (as defined in section 152(c), determined without regard to paragraph (1)(D) thereof and section 152(e)). The term β€œqualifying chil...
--------------------------------------------------------------------------------

πŸ“˜ CLS BERT + Annoy
================
1. [Procedure and Administration] For purposes of this section, any failure to comply with section 6226(b)(4)(A)(ii) shall be treated as a failure to pay the amount described in subclause (II) thereof and such amount shall be treated ...
2. [Estate and Gift Taxes] For purposes of this subsection, an applicable family member shall be treated in the same manner as the transferor with respect to any distribution right retained by such family member to which subsec...
3. [Income Taxes] The amount of the credit allowed under this section to any taxpayer for any taxable year shall be reduced (but not below zero) by the aggregate amount of payments made under section 7527A to such taxp...
4. [Estate and Gift Taxes] For purposes of the second sentence of subsection (a) and the last sentence of subsection (b), an agreement which meets the requirements of section 6324A (relating to special lien for estate tax defer...
5. [Miscellaneous Excise Taxes] An individual for whose benefit an Archer MSA (within the meaning of section 220(d)) is established shall be exempt from the tax imposed by this section with respect to any transaction concerning such...
--------------------------------------------------------------------------------

πŸ“˜ FT BERT + FAISS
===============
1. [Income Taxes] Subsec. (b)(3)(B)(ii).  Pub. L. 94–455, Β§β€―1901(b)(9) , redesignated cl. (iii) as (ii) and struck out former cl. (ii) which provided that an individual who was a dependent solely by reason of par. (10)...
2. [Income Taxes] β€œ(A) the taxpayer is allowed a deduction under section 151 with respect to such individual for the taxable year, β€œ(B) such individual has not attained the age of 17 as of the close of the calendar yea...
3. [Income Taxes] If the taxpayer is married at the close of the taxable year, the deduction shall be allowed under subsection (a) only if the taxpayer and the taxpayer’s spouse file a joint return for the taxable year...
4. [Income Taxes] 1978β€”Subsec. (f)(6).  Pub. L. 95–600  substituted provision disallowing a credit for any amount paid by a taxpayer to an individual with respect to whom, for the taxable year, a deduction under sectio...
5. [Income Taxes] the taxpayer to an individual bearing a relationship described in section 152(a)(1) through (8), or a dependent described in section 152(a)(9), except that a credit was allowed for an amount paid by a...
--------------------------------------------------------------------------------

πŸ“˜ CLS BERT + FAISS
================
1. [Procedure and Administration] For purposes of this section, any failure to comply with section 6226(b)(4)(A)(ii) shall be treated as a failure to pay the amount described in subclause (II) thereof and such amount shall be treated ...
2. [Estate and Gift Taxes] For purposes of this subsection, an applicable family member shall be treated in the same manner as the transferor with respect to any distribution right retained by such family member to which subsec...
3. [Estate and Gift Taxes] For purposes of the second sentence of subsection (a) and the last sentence of subsection (b), an agreement which meets the requirements of section 6324A (relating to special lien for estate tax defer...
4. [Miscellaneous Excise Taxes] An individual for whose benefit an Archer MSA (within the meaning of section 220(d)) is established shall be exempt from the tax imposed by this section with respect to any transaction concerning such...
5. [Income Taxes] The amount of the credit allowed under this section to any taxpayer for any taxable year shall be reduced (but not below zero) by the aggregate amount of payments made under section 7527A to such taxp...
--------------------------------------------------------------------------------

πŸ” Query: "Tax on head of household"
--------------------------------------------------------------------------------

πŸ“˜ FT BERT + Annoy
===============
1. [Employment Taxes] Subsec. (e)(3).  Pub. L. 98–76, Β§β€―225(c)(1)(C) , (6), substituted β€œtaxes imposed by section 3201” for β€œtax imposed by section 3201”, and β€œsuch taxes” for β€œsuch tax”. Subsec. (e)(4)(A).  Pub. L. 98–76,...
2. [Income Taxes] Subsec. (m)(2)(B).  Pub. L. 98–369, Β§β€―628(a)(2) , substituted β€œis exempt from tax under this title without regard to any provision of law which is not contained in this title and which is not containe...
3. [Income Taxes] For purposes of clause (ii)(II), any tax paid before  January 1, 1987 , pursuant to a process in effect before  August 16, 1986 , shall be treated as paid before the date of the enactment of this Act....
4. [Trust Fund Code] Pub. L. 105–34, title IX, Β§β€―901(f) ,  Aug. 5, 1997 ,  111 Stat. 872 , provided that:  β€œThe amendments made by this section [amending this section] shall apply to taxes received in the Treasury after  ...
5. [Trust Fund Code] Pub. L. 106–554, Β§β€―1(a)(7) [title III, Β§β€―318(e)(2)] ,  Dec. 21, 2000 ,  114 Stat. 2763 , 2763A–646, provided that:  β€œThe amendment made by paragraph (1) [amending this section] shall apply with respec...
--------------------------------------------------------------------------------

πŸ“˜ CLS BERT + Annoy
================
1. [Procedure and Administration] is entitled to the benefits of section 7508 of the Internal Revenue Code of 1986:...
2. [Income Taxes] the regular tax liability attributable to income from such partnership....
3. [Income Taxes] allowed as a deduction under section 162(a) (relating to trade or business expenses).”...
4. [Income Taxes] the corporation’s taxable income and not properly chargeable to capital account)”....
5. [Income Taxes] payable to patrons which are derived from business done with or for patrons during the taxable year.”...
--------------------------------------------------------------------------------

πŸ“˜ FT BERT + FAISS
===============
1. [Employment Taxes] Subsec. (e)(3).  Pub. L. 98–76, Β§β€―225(c)(1)(C) , (6), substituted β€œtaxes imposed by section 3201” for β€œtax imposed by section 3201”, and β€œsuch taxes” for β€œsuch tax”. Subsec. (e)(4)(A).  Pub. L. 98–76,...
2. [Income Taxes] Subsec. (m)(2)(B).  Pub. L. 98–369, Β§β€―628(a)(2) , substituted β€œis exempt from tax under this title without regard to any provision of law which is not contained in this title and which is not containe...
3. [Procedure and Administration] For purposes of subsections (a), (b), and (c), the taxes imposed by section 4041(d) shall be treated as imposed by section 4041(a)....
4. [Income Taxes] Subsec. (b).  Pub. L. 91–172  generally revised rates of tax of heads of household downwards and struck out provisions defining head of household, determination of status, and limitations. For definit...
5. [Miscellaneous Excise Taxes] Subsec. (b)(3).  Pub. L. 99–499, Β§β€―521(d)(1) , added par. (3). Subsecs. (d), (e).  Pub. L. 99–499, Β§β€―521(a)(2) , added subsec. (d) and redesignated former subsec. (d) as (e). Subsec. (f)(3).  Pub. L. ...
--------------------------------------------------------------------------------

πŸ“˜ CLS BERT + FAISS
================
1. [Procedure and Administration] is entitled to the benefits of section 7508 of the Internal Revenue Code of 1986:...
2. [Income Taxes] the corporation’s taxable income and not properly chargeable to capital account)”....
3. [Income Taxes] the regular tax liability attributable to income from such partnership....
4. [Income Taxes] allowed as a deduction under section 162(a) (relating to trade or business expenses).”...
5. [Income Taxes] rules under section 1091 of the Internal Revenue Code of 1986 relating to losses from wash sales.”...
--------------------------------------------------------------------------------

πŸ” Query: "Tree huggers are cool!"
--------------------------------------------------------------------------------

πŸ“˜ FT BERT + Annoy
===============
1. [Income Taxes] this part by  Pub. L. 99–514 . See  section 7703 of this title ....
2. [Alcohol, Tobacco, and Certain Other Excise Taxes] (a) 5215.   (b) 5305.   (c) 5215, 5306. The prior sections,  act Aug. 16, 1954, ch. 736 , are set out in  68A Stat. 640 , 657. 1976β€”Subsecs. (a) to (c).  Pub. L. 94–455  struck out β€œor his delegate” a...
3. [Income Taxes] provisions generally, prior to the general revision of this part by  Pub. L. 98–369, Β§β€―211(a) ....
4. [Income Taxes] 1996β€” Pub. L. 104–188  substituted β€œthis section” for β€œthis subsection”....
5. [Miscellaneous Excise Taxes] (within the meaning of section 503(b) of such Code) or the corresponding provisions of prior law;...
--------------------------------------------------------------------------------

πŸ“˜ CLS BERT + Annoy
================
1. [Income Taxes] β€œ(B)  by substituting β€˜13 percent’ for β€˜20 percent’....
2. [Procedure and Administration] 1976β€” Pub. L. 94–455  struck out β€œor his delegate” after β€œSecretary”....
3. [Estate and Gift Taxes] 1976β€” Pub. L. 94–455  struck out β€œor his delegate” after β€œSecretary”....
4. [Employment Taxes] 1976β€” Pub. L. 94–455  struck out β€œor his delegate” after β€œSecretary”....
5. [Alcohol, Tobacco, and Certain Other Excise Taxes] 1976β€” Pub. L. 94–455  struck out β€œor his delegate” after β€œSecretary”....
--------------------------------------------------------------------------------

πŸ“˜ FT BERT + FAISS
===============
1. [Income Taxes] this part by  Pub. L. 99–514 . See  section 7703 of this title ....
2. [Employment Taxes] ]...
3. [Income Taxes] 1996β€” Pub. L. 104–188  substituted β€œthis section” for β€œthis subsection”....
4. [Alcohol, Tobacco, and Certain Other Excise Taxes] (a) 5215.   (b) 5305.   (c) 5215, 5306. The prior sections,  act Aug. 16, 1954, ch. 736 , are set out in  68A Stat. 640 , 657. 1976β€”Subsecs. (a) to (c).  Pub. L. 94–455  struck out β€œor his delegate” a...
5. [Income Taxes] without regard toβ€” β€œ(A)  paragraph (2) of such section 453(b), and...
--------------------------------------------------------------------------------

πŸ“˜ CLS BERT + FAISS
================
1. [Employment Taxes] after β€œsuch employer’s foreign”, and inserted β€œor residents” after β€œcitizens”....
2. [Income Taxes] β€œ(B)  by substituting β€˜13 percent’ for β€˜20 percent’....
3. [Procedure and Administration] 1976β€” Pub. L. 94–455  struck out β€œor his delegate” after β€œSecretary”....
4. [Estate and Gift Taxes] 1976β€” Pub. L. 94–455  struck out β€œor his delegate” after β€œSecretary”....
5. [Employment Taxes] 1976β€” Pub. L. 94–455  struck out β€œor his delegate” after β€œSecretary”....
--------------------------------------------------------------------------------

Comparison of Embedding Models' Embedding Spaces (Low dimensional projection)

In [ ]:
def plot_two_faiss_embeddings_pca_side_by_side(
    vector_store1: FAISS,
    vector_store2: FAISS,
    color_key="subtitle",
    title1="Vector Store 1",
    title2="Vector Store 2",
    figsize=(16, 7),
):
    """
    Plot PCA projections of embeddings from two LangChain FAISS vector stores side by side.

    A single 2-component PCA is fitted on the stacked embeddings of both
    stores, so the two subplots share the same axes. Points are coloured by
    the value of ``color_key`` in each document's metadata, with one colour per
    label shared by both subplots and by the common legend.

    Args:
        vector_store1 (FAISS): First FAISS vector store.
        vector_store2 (FAISS): Second FAISS vector store. Must have the same
            embedding dimension as the first, since the embeddings are stacked.
        color_key (str): Metadata key to color points by. Default is "subtitle".
            Documents without the key are labelled "Unknown".
        title1 (str): Title for the first subplot.
        title2 (str): Title for the second subplot.
        figsize (tuple): Figure size.

    Returns:
        fig (matplotlib.figure.Figure): The matplotlib figure object.
        axs (np.ndarray): Array of the two subplot axes.
    """
    # Local import: only needed for the hand-built legend handles below.
    from matplotlib.lines import Line2D

    def extract_embeddings_and_labels(vector_store):
        faiss_index = vector_store.index
        num_vectors = faiss_index.ntotal
        dim = faiss_index.d
        embeddings = np.zeros((num_vectors, dim), dtype="float32")
        faiss_index.reconstruct_n(0, num_vectors, embeddings)
        # Resolve each index position to its document through the store's own
        # position -> docstore-id mapping, so every label belongs to the vector
        # in the same row rather than relying on docstore dict ordering.
        id_for_position = vector_store.index_to_docstore_id
        stored_docs = vector_store.docstore._dict
        labels = [
            stored_docs[id_for_position[position]].metadata.get(color_key, "Unknown")
            for position in range(num_vectors)
        ]
        return embeddings, labels

    embeddings1, labels1 = extract_embeddings_and_labels(vector_store1)
    embeddings2, labels2 = extract_embeddings_and_labels(vector_store2)

    # Combine unique labels from both sets for consistent coloring
    unique_labels = sorted(set(labels1) | set(labels2))
    label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}

    color_indices1 = [label_to_idx[label] for label in labels1]
    color_indices2 = [label_to_idx[label] for label in labels2]

    pca = PCA(n_components=2)
    # Fit PCA on combined data for consistent axes
    combined_embeddings = np.vstack([embeddings1, embeddings2])
    combined_projected = pca.fit_transform(combined_embeddings)

    projected1 = combined_projected[:len(embeddings1)]
    projected2 = combined_projected[len(embeddings1):]

    fig, axs = plt.subplots(1, 2, figsize=figsize)

    # Pin the colour normalisation to the full label range so a label gets the
    # same colour in both subplots even if one store lacks some labels. tab10
    # only has 10 colours, so fall back to tab20 when there are more labels.
    scatter_style = dict(
        cmap="tab10" if len(unique_labels) <= 10 else "tab20",
        vmin=0,
        vmax=max(len(unique_labels) - 1, 1),
        alpha=0.7,
        s=5,
    )
    scatter1 = axs[0].scatter(projected1[:, 0], projected1[:, 1], c=color_indices1, **scatter_style)
    axs[1].scatter(projected2[:, 0], projected2[:, 1], c=color_indices2, **scatter_style)

    # Legend (one common legend on the right). Handles are built explicitly,
    # one per label, so they always line up with `unique_labels` regardless of
    # how many labels there are or which ones occur in the first store.
    handles = [
        Line2D(
            [], [],
            linestyle="",
            marker="o",
            alpha=0.7,
            color=scatter1.cmap(scatter1.norm(idx)),
        )
        for idx in range(len(unique_labels))
    ]
    fig.legend(handles, unique_labels, title=color_key, loc='center right')

    axs[0].set_title(title1)
    axs[1].set_title(title2)

    var_pc1 = pca.explained_variance_ratio_[0] * 100
    var_pc2 = pca.explained_variance_ratio_[1] * 100

    for ax in axs:
        ax.set_xlabel(f"PC1 ({var_pc1:.1f}%)")
        ax.set_ylabel(f"PC2 ({var_pc2:.1f}%)")

    fig.tight_layout(rect=[0, 0, 0.75, 1])  # Leave room for legend on right

    return fig, axs

PCA

In [24]:
# Compare the two FAISS stores (fine-tuned vs. pretrained embeddings) in a
# shared 2-D PCA projection, coloring each chunk by its subtitle.
fig, axs = plot_two_faiss_embeddings_pca_side_by_side(
    ft_vector_store_2,
    pt_vector_store_2,
    color_key="subtitle",
    title1="Fine-tuned BERT FAISS Embeddings",
    title2="Pretrained BERT FAISS Embeddings"
)
plt.show()
No description has been provided for this image

Analysis of PC1-PC2 Embedding Space

One can see clearer grouping by subtitle in the fine-tuned BERT's embedding space. This is presumably due to the similarity in the embedded representations induced by the document-classification fine-tuning task. It is worth noting that a large amount of information is lost in this visualization due to the projection from 768 dimensions down to 2.