# Author: Attila Simkó
# Commit 2db37b1 — "big upgrade"
import pandas as pd
import os
import zipfile
import os
import re
from uuid import uuid4
import pdfplumber
import numpy as np
from urllib.request import urlretrieve
import xml.etree.ElementTree as ET
import re
import fitz # PyMuPDF
def get_fitz_urls(pdf_path):
    """
    Extract hyperlink targets embedded as PDF link annotations.

    :param pdf_path: Path to the PDF file
    :return: List of URI strings, in page order
    """
    urls = []
    # Context manager guarantees the document handle is closed even if an
    # exception is raised (the original left the fitz.Document open).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for link in page.get_links():
                # get_links() also reports internal/goto links; only
                # entries with a 'uri' key point outside the document.
                if 'uri' in link:
                    urls.append(link['uri'])
    return urls
# TEI XML namespace map for ElementTree queries (e.g. on GROBID output).
# NOTE(review): not referenced by any code visible in this file — presumably
# kept for the commented-out GROBID path in fetch_url; confirm before removing.
NAMESPACE = {'tei': 'http://www.tei-c.org/ns/1.0'}
def find_pattern_in_xml(root, pattern):
    """
    Search for a regex pattern in all text fields of an XML tree.

    Checks each element's text content and every attribute value, for the
    root element and all of its descendants.

    :param root: The root Element of the XML tree
    :param pattern: The regex pattern to search for
    :return: A list of matching strings, in document order
    """
    # Compile once up front; the original recompiled at every recursion level.
    regex = re.compile(pattern)
    matches = []
    # Element.iter() walks the subtree in document order (self first, then
    # children depth-first) — the same visit order as the original hand-rolled
    # recursion, but without risking Python's recursion limit on deep trees.
    for elem in root.iter():
        if elem.text:
            matches.extend(regex.findall(elem.text))
        for attr_value in elem.attrib.values():
            matches.extend(regex.findall(attr_value))
    return matches
def fetch_url(pdf_path):
    """
    Collect candidate git repository URLs from a PDF.

    Combines two extraction strategies:
      1. Regex search over text extracted with pdfplumber (catches URLs
         typeset as plain text, possibly wrapped across line breaks).
      2. PDF link annotations via PyMuPDF (catches clickable links).

    The combined result is de-duplicated and filtered down to URLs that
    contain both a path separator and the substring "git" (GitHub/GitLab
    and similar hosts).

    :param pdf_path: Path to the PDF file
    :return: Sorted list of unique git-related URL strings
    :raises ValueError: If pdf_path is None
    """
    if pdf_path is None:
        raise ValueError("Pdf has no path")
    # Matches an optional doi:/http(s) prefix, then a domain name, IPv4
    # address, or IPv6 address, then an optional port and optional path.
    link_pattern = "\\b((?:doi:)?(?:https?://)?(?:(?:www\\.)?(?:[\\da-z\\.-]+)\\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\\w\\.-]*)*/?)\\b"
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                # Re-join tokens split across line breaks (hyphenated words,
                # wrapped URLs) before flattening newlines to spaces, so that
                # multi-line URLs survive as one string.
                full_text += text.replace("-\n", "-").replace("_\n", "_").replace("/\n", "/").replace("\n", " ") + " "
    # The pattern has a single capturing group, so findall returns the
    # full matched URL strings directly.
    urls = re.findall(link_pattern, full_text)
    urls.extend(get_fitz_urls(pdf_path))
    # De-duplicate and sort (same lexicographic order np.unique produced),
    # then keep only URLs that look like git repository links.
    urls = sorted(set(urls))
    urls = [s for s in urls if "/" in s and "git" in s]
    return urls