| name | doc-to-vector-dataset-generator |
| description | Converts documents into clean, chunked datasets suitable for embeddings and vector search. Produces chunked JSONL files with metadata, deduplication logic, and quality checks. Use when preparing "training data", "vector datasets", "document processing", or "embedding data". |
Doc-to-Vector Dataset Generator
Transform documents into high-quality vector search datasets.
Pipeline Steps
- Extract text from various formats (PDF, Markdown, DOCX, HTML)
- Clean text (remove noise, normalize)
- Chunk strategically (semantic boundaries)
- Add metadata (source, timestamps, classification)
- Deduplicate (near-duplicate detection)
- Quality check (length, content validation)
- Export JSONL (one chunk per line)
Text Extraction
import pymupdf
def extract_pdf(filepath: str) -> str:
    """Return the concatenated plain text of every page in a PDF.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, joined without a separator.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if text extraction fails part-way through.
    with pymupdf.open(filepath) as doc:
        # join() avoids the quadratic cost of growing a string with += per page.
        return "".join(page.get_text() for page in doc)
def extract_markdown(filepath: str) -> str:
    """Return the full contents of a Markdown file as text.

    Args:
        filepath: Path of the Markdown file to read.

    Returns:
        The decoded file contents.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Pin the encoding: without it open() decodes with the platform default,
    # so the same file can yield different text on different machines.
    with open(filepath, encoding="utf-8") as f:
        return f.read()
Text Cleaning
import re
def clean_text(text: str) -> str:
    """Strip extraction noise and normalize whitespace, keeping paragraphs.

    Page markers such as "Page 12" and anything starting with "http" up to
    the next whitespace are removed, and characters that cannot be encoded
    as UTF-8 (for example lone surrogates) are dropped. Whitespace runs
    inside a paragraph collapse to a single space, while blank lines between
    paragraphs are kept as exactly one blank line so that paragraph-based
    chunking still has boundaries to split on.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text with no leading or trailing whitespace; an empty
        string if nothing but noise or whitespace was present.
    """
    text = text.encode('utf-8', 'ignore').decode('utf-8')
    # Remove noise BEFORE normalizing whitespace; doing it afterwards leaves
    # a double space wherever a marker or URL used to be.
    text = re.sub(r'Page\s+\d+', '', text)
    text = re.sub(r'http\S+', '', text)
    # Normalize each paragraph on its own so the blank-line boundaries
    # survive; paragraphs left empty by the removals above are dropped.
    paragraphs = (
        re.sub(r'\s+', ' ', paragraph).strip()
        for paragraph in re.split(r'\n\s*\n', text)
    )
    return "\n\n".join(paragraph for paragraph in paragraphs if paragraph)
Semantic Chunking
def _split_long_paragraph(paragraph: str, max_chunk_size: int) -> List[str]:
    """Split one oversized paragraph at sentence ends into pieces that fit."""
    import re

    parts = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', paragraph):
        # A single sentence longer than the limit has no natural boundary
        # left, so fall back to fixed-width slices.
        while len(sentence) > max_chunk_size:
            if current:
                parts.append(current)
                current = ""
            parts.append(sentence[:max_chunk_size].strip())
            sentence = sentence[max_chunk_size:].strip()
        if not sentence:
            continue
        candidate = f"{current} {sentence}" if current else sentence
        if len(candidate) <= max_chunk_size:
            current = candidate
        else:
            parts.append(current)
            current = sentence
    if current:
        parts.append(current)
    return [part for part in parts if part]


def semantic_chunk(text: str, max_chunk_size: int = 1000) -> List[str]:
    """Chunk text at semantic boundaries (paragraphs, then sentences).

    Paragraphs (separated by a blank line) are packed greedily into chunks
    of at most ``max_chunk_size`` characters. A paragraph that is itself too
    long is split at sentence ends, and a sentence that is still too long is
    sliced at fixed width, so no returned chunk exceeds the limit.

    Args:
        text: Text whose paragraphs are separated by blank lines.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        The chunks in document order. Empty or whitespace-only input yields
        an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is less than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")

    chunks = []
    current = ""
    for paragraph in text.split('\n\n'):
        paragraph = paragraph.strip()
        if not paragraph:
            # Skip blank paragraphs so empty input does not produce a "" chunk.
            continue
        if len(paragraph) > max_chunk_size:
            # Flush what we have, then emit the oversized paragraph as its
            # own sentence-level pieces to keep document order intact.
            if current:
                chunks.append(current)
                current = ""
            chunks.extend(_split_long_paragraph(paragraph, max_chunk_size))
            continue
        candidate = f"{current}\n\n{paragraph}" if current else paragraph
        if len(candidate) <= max_chunk_size:
            current = candidate
        else:
            chunks.append(current)
            current = paragraph
    if current:
        chunks.append(current)
    return chunks
Metadata Extraction
def extract_metadata(filepath: str, chunk: str, chunk_idx: int) -> dict:
    """Build the metadata record stored alongside one chunk.

    Args:
        filepath: Path of the document the chunk came from.
        chunk: The chunk text.
        chunk_idx: Position of the chunk within its document.

    Returns:
        A dict with the source path, a chunk id derived from the path and
        index, size counters, a UTC creation timestamp, heuristic
        ``has_code`` / ``has_table`` flags, and the detected language.
    """
    import hashlib
    from datetime import datetime, timezone

    # The built-in hash() of a str is salted per process, so it would give
    # the same file a different id on every run. A content digest of the
    # path is stable across runs and machines ("surrogatepass" keeps
    # undecodable filename bytes from raising).
    source_digest = hashlib.sha256(
        filepath.encode("utf-8", "surrogatepass")
    ).hexdigest()[:16]

    return {
        "source": filepath,
        "chunk_id": f"{source_digest}_{chunk_idx}",
        "chunk_index": chunk_idx,
        "char_count": len(chunk),
        "word_count": len(chunk.split()),
        # Timezone-aware so timestamps from different machines are comparable.
        "created_at": datetime.now(timezone.utc).isoformat(),
        # Keyword heuristics: these can also fire on ordinary prose.
        "has_code": bool(re.search(r'```|def |class |function', chunk)),
        "has_table": bool(re.search(r'\|.*\|', chunk)),
        "language": detect_language(chunk),
    }
Deduplication
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def deduplicate_chunks(chunks: List[dict], threshold: float = 0.95) -> List[dict]:
    """Remove near-duplicate chunks, keeping the first of each similar group.

    Each chunk's ``"text"`` is vectorized with TF-IDF. A chunk is dropped
    when its cosine similarity to an earlier chunk that was kept exceeds
    ``threshold``.

    Args:
        chunks: Chunk dicts, each with a ``"text"`` key.
        threshold: Similarity above which a later chunk counts as a duplicate.

    Returns:
        A new list with the surviving chunks in their original order.
    """
    # With fewer than two chunks nothing can be a duplicate. Returning early
    # also avoids fitting TF-IDF on an empty corpus, which raises ValueError.
    if len(chunks) < 2:
        return list(chunks)

    texts = [c["text"] for c in chunks]
    vectors = TfidfVectorizer().fit_transform(texts)
    # NOTE: this is a dense n x n matrix, so memory grows quadratically with
    # the number of chunks.
    similarity_matrix = cosine_similarity(vectors)

    to_remove = set()
    total = len(chunks)
    for i in range(total):
        if i in to_remove:
            # Already a duplicate of an earlier chunk; do not let it
            # eliminate others.
            continue
        for j in range(i + 1, total):
            if similarity_matrix[i, j] > threshold:
                to_remove.add(j)

    return [c for i, c in enumerate(chunks) if i not in to_remove]
Quality Checks
def quality_check(
    chunk: dict,
    *,
    min_chars: int = 50,
    max_chars: int = 5000,
    min_alpha_ratio: float = 0.5,
    language: str = "en",
) -> bool:
    """Validate chunk quality.

    A chunk passes when its text length lies within
    ``[min_chars, max_chars]``, at least ``min_alpha_ratio`` of its
    characters are alphabetic, and its metadata language equals ``language``.

    Args:
        chunk: Dict with a ``"text"`` string and a ``"metadata"`` dict that
            contains a ``"language"`` entry.
        min_chars: Minimum accepted text length.
        max_chars: Maximum accepted text length.
        min_alpha_ratio: Minimum fraction of alphabetic characters.
        language: Required value of ``chunk["metadata"]["language"]``.

    Returns:
        True if the chunk passes every check, otherwise False.
    """
    text = chunk["text"]
    # Empty text is rejected up front; it would otherwise divide by zero
    # below when min_chars is lowered to 0.
    if not text:
        return False
    if not min_chars <= len(text) <= max_chars:
        return False
    alpha_ratio = sum(c.isalpha() for c in text) / len(text)
    if alpha_ratio < min_alpha_ratio:
        return False
    return chunk["metadata"]["language"] == language
JSONL Export
import json
def export_jsonl(chunks: List[dict], output_path: str):
    """Write the chunks to ``output_path`` in JSON Lines format.

    Every chunk is serialized as one JSON object followed by a newline; an
    existing file at ``output_path`` is overwritten.
    """
    with open(output_path, 'w') as sink:
        sink.writelines(f"{json.dumps(record)}\n" for record in chunks)
{
"text": "Chunk text content here...",
"metadata": {
"source": "docs/auth.md",
"chunk_id": "abc123_0",
"chunk_index": 0,
"char_count": 542,
"word_count": 89,
"has_code": true
}
}
Complete Pipeline
def process_documents(input_dir: str, output_path: str):
    """Run the full pipeline over every Markdown file under ``input_dir``.

    Each file is read, cleaned, and chunked; chunks that pass the quality
    check are collected with their metadata, de-duplicated across all
    files, and written to ``output_path`` as JSONL. The number of exported
    chunks is printed at the end.

    Args:
        input_dir: Directory searched recursively for ``*.md`` files.
        output_path: Destination JSONL file.
    """
    from glob import glob

    all_chunks = []
    # recursive=True is what makes "**" match any directory depth, including
    # files directly inside input_dir. Without it "**" behaves like "*" and
    # only files exactly one level down are found. Sorting keeps the output
    # order reproducible between runs.
    for filepath in sorted(glob(f"{input_dir}/**/*.md", recursive=True)):
        text = extract_markdown(filepath)
        text = clean_text(text)
        for i, chunk in enumerate(semantic_chunk(text)):
            chunk_obj = {
                "text": chunk,
                "metadata": extract_metadata(filepath, chunk, i),
            }
            if quality_check(chunk_obj):
                all_chunks.append(chunk_obj)

    # Skip de-duplication when nothing survived: the TF-IDF step cannot be
    # fitted on an empty corpus. An empty output file is still written.
    unique_chunks = deduplicate_chunks(all_chunks) if all_chunks else []
    export_jsonl(unique_chunks, output_path)
    print(f"Processed {len(unique_chunks)} chunks")
Best Practices
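- Chunk at semantic boundaries (paragraphs and sentences) rather than at fixed offsets
- Attach rich metadata to every chunk so results can be filtered
- Deduplicate aggressively to keep near-identical chunks out of the index
- Run quality checks to keep garbage out of the dataset
- Export as JSONL so the dataset can be streamed line by line
- Version your datasets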
Output Checklist