Léo Bourrel committed on
Commit
392758b
·
1 Parent(s): e6889e1

feat: print only first title && abstract

Browse files
Files changed (2) hide show
  1. custom_pgvector.py +5 -3
  2. utils.py +20 -0
custom_pgvector.py CHANGED
@@ -16,7 +16,7 @@ from langchain.vectorstores.base import VectorStore
16
  from pgvector.sqlalchemy import Vector
17
  from sqlalchemy import delete, text
18
  from sqlalchemy.orm import Session, declarative_base
19
-
20
 
21
  class DistanceStrategy(str, enum.Enum):
22
  """Enumerator of the Distance strategies."""
@@ -316,9 +316,9 @@ class CustomPGVector(VectorStore):
316
  Document(
317
  page_content=json.dumps(
318
  {
319
- "abstract": result["abstract"],
320
  "id": result["id"],
321
- "title": result["title"],
322
  "authors": result["authors"],
323
  "doi": result["doi"],
324
  # "halID": result["halID"],
@@ -376,6 +376,8 @@ class CustomPGVector(VectorStore):
376
  "distance",
377
  ],
378
  )
 
 
379
  results = results.to_dict(orient="records")
380
  return results
381
 
 
16
  from pgvector.sqlalchemy import Vector
17
  from sqlalchemy import delete, text
18
  from sqlalchemy.orm import Session, declarative_base
19
+ from utils import str_to_list
20
 
21
  class DistanceStrategy(str, enum.Enum):
22
  """Enumerator of the Distance strategies."""
 
316
  Document(
317
  page_content=json.dumps(
318
  {
319
+ "abstract": result["abstract"][0],
320
  "id": result["id"],
321
+ "title": result["title"][0],
322
  "authors": result["authors"],
323
  "doi": result["doi"],
324
  # "halID": result["halID"],
 
376
  "distance",
377
  ],
378
  )
379
+ results["abstract"] = results["abstract"].apply(str_to_list)
380
+ results["title"] = results["title"].apply(str_to_list)
381
  results = results.to_dict(orient="records")
382
  return results
383
 
utils.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
def str_to_list(str_input: str) -> list[str]:
    """Split the textual form of a list of quoted strings into its items.

    The input is expected to look like ``"['first', 'second']"`` (single or
    double quotes, optionally wrapped in ``[]`` or ``()``). Items are split on
    the ``quote, comma, space, quote`` separators; the enclosing bracket and
    quote characters are then removed from the outer ends of the whole input.

    Args:
        str_input: The string to split. A value that is already a ``list`` is
            returned unchanged.

    Returns:
        The list of items. A string without any separator yields a
        single-element list; ``"[]"`` yields ``[""]``.

    Raises:
        TypeError: If ``str_input`` is neither a ``list`` nor a string
            (raised by ``re.split``).
    """
    if isinstance(str_input, list):
        return str_input

    items = re.split(r"', '|\", \"|', \"|\", '", str_input)

    # Only the first piece can carry the opening bracket/quote of the list.
    # Stripping these from every piece would eat a legitimate leading "[" or
    # "(" that belongs to an item's own text.
    head = items[0]
    for opener in ("[", "(", "'", '"'):
        head = head.removeprefix(opener)
    items[0] = head

    # Likewise only the last piece can carry the closing quote/bracket. The
    # separator already consumed the closing quote of every earlier item, so
    # a trailing ")" or "]" on those pieces is real content, not a delimiter.
    tail = items[-1]
    for closer in ("]", ")", "'", '"'):
        tail = tail.removesuffix(closer)
    items[-1] = tail

    return items