import pandas as pd
import os
import zipfile
import re
from uuid import uuid4
import pdfplumber
import numpy as np
from urllib.request import urlretrieve
import xml.etree.ElementTree as ET
import fitz  # PyMuPDF


def get_fitz_urls(pdf_path):
    """Collect URLs stored as link annotations in a PDF via PyMuPDF."""
    doc = fitz.open(pdf_path)
    urls = []
    for page in doc:
        for link in page.get_links():
            if 'uri' in link:
                urls.append(link['uri'])
    return urls
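

# Sketch of expected usage (the path below is hypothetical). Link annotations
# only cover clickable hyperlinks, so URLs printed as plain text in the PDF
# are missed here and picked up by the regex pass in fetch_url instead.
# get_fitz_urls("paper.pdf")  # e.g. ['https://github.com/user/repo']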


# TEI namespace used by GROBID's XML output (see the disabled GROBID path in fetch_url)
NAMESPACE = {'tei': 'http://www.tei-c.org/ns/1.0'}


def find_pattern_in_xml(root, pattern):
    """
    Recursively search for a regex pattern in all text fields of an XML tree.

    :param root: The root Element of the XML tree
    :param pattern: The regex pattern to search for
    :return: A list of matching strings
    """
    matches = []
    regex = re.compile(pattern)
    # Check element text
    if root.text:
        matches.extend(regex.findall(root.text))
    # Check element attributes
    for attr_value in root.attrib.values():
        matches.extend(regex.findall(attr_value))
    # Recursively search in children
    for child in root:
        matches.extend(find_pattern_in_xml(child, pattern))
    return matches
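

# Sketch of expected usage on a minimal TEI fragment (assumed input; real
# GROBID output is much larger). Note that only .text and attribute values
# are searched; element tails (text after a child's closing tag) are not.
# tei = ET.fromstring(
#     '<TEI xmlns="http://www.tei-c.org/ns/1.0">'
#     '<text><p>Code at https://github.com/user/repo</p></text></TEI>'
# )
# find_pattern_in_xml(tei, r"https?://\S+")  # -> ['https://github.com/user/repo']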


def fetch_url(pdf_path):
    """Extract candidate code-repository URLs from a PDF by combining a regex
    pass over the extracted text (pdfplumber) with the PDF's link annotations
    (PyMuPDF), then keeping only deduplicated Git-hosting links."""
    if pdf_path is None:
        raise ValueError("PDF has no path")
    urls = []
    # Matches an optional doi:/http(s):// prefix, then a domain name, IPv4, or
    # IPv6 address, followed by an optional port and an optional path
    link_pattern = "\\b((?:doi:)?(?:https?://)?(?:(?:www\\.)?(?:[\\da-z\\.-]+)\\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\\w\\.-]*)*/?)\\b"
    # Extract the full text with pdfplumber, re-joining URLs that were
    # hyphen-, underscore-, or slash-wrapped across line breaks
    full_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text.replace("-\n", "-").replace("_\n", "_").replace("/\n", "/").replace("\n", " ") + " "
    # Find all URLs in the combined text
    found_urls = re.findall(link_pattern, full_text)
    urls.extend(found_urls)
    # Disabled GROBID alternative: convert the PDF to TEI XML and search it
    # with find_pattern_in_xml instead of the plain-text pass above.
    # paper = pdf_to_grobid(file_name)
    # found_urls = find_pattern_in_xml(paper, link_pattern)
    # urls.extend(found_urls)
    # os.remove(file_name)
    # Also collect URLs stored as link annotations
    fitz_urls = get_fitz_urls(pdf_path)
    urls.extend(fitz_urls)
    # Deduplicate and keep only links that look like Git repositories
    urls = np.unique(urls)
    urls = [s for s in urls if "/" in s]
    urls = [s for s in urls if "git" in s]
    return urls
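

# Minimal usage sketch, assuming "paper.pdf" is a hypothetical local file.
# Prints the Git-hosting URLs found in both the text and the annotations.
if __name__ == "__main__":
    sample_pdf = "paper.pdf"  # hypothetical path
    if os.path.exists(sample_pdf):
        for url in fetch_url(sample_pdf):
            print(url)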