|
Here is how you can create a function to truncate and map the start and end tokens of the `answer` to the `context`:
|
|
|
```py
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
```
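Note that the function above assumes a `tokenizer` has already been defined, and it must be a fast (Rust-backed) tokenizer: both `return_offsets_mapping=True` and `sequence_ids` are only available on fast tokenizers. A minimal setup sketch follows; the DistilBERT checkpoint is just an illustrative choice:

```py
from transformers import AutoTokenizer

# Any fast tokenizer works; offset mappings are not available on slow (Python) tokenizers
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
```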
|
|
|
To apply the preprocessing function over the entire dataset, use the 🤗 Datasets [`~datasets.Dataset.map`] function.
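Since `preprocess_function` tokenizes whole batches of questions and contexts, pass `batched=True`. Here is a minimal sketch, assuming a SQuAD-style dataset (with `question`, `context`, and `answers` columns) has been loaded into a variable named `squad`; the dataset and variable names are illustrative:

```py
from datasets import load_dataset

# Illustrative: any dataset with "question", "context", and "answers" columns works
squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)

# batched=True feeds batches of examples to preprocess_function;
# remove_columns drops the raw text columns the model no longer needs
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad["train"].column_names,
)
```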