File size: 3,312 Bytes
2db37b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
import os
import zipfile
import os
import re
from uuid import uuid4
import pdfplumber
import numpy as np
from urllib.request import urlretrieve
import xml.etree.ElementTree as ET
import re
import fitz  # PyMuPDF

def get_fitz_urls(pdf_path):
    """Return every URI link annotation embedded in a PDF.

    :param pdf_path: Path to the PDF file.
    :return: List of URL strings in page order (duplicates preserved).
    """
    urls = []
    # Use a context manager so the document handle is closed even on error
    # (the original opened the Document and never closed it).
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Only external links carry a 'uri' key; internal GoTo links do not.
            urls.extend(link['uri'] for link in page.get_links() if 'uri' in link)
    return urls


# Namespace map for TEI XML (used with ElementTree's namespace-aware queries).
# NOTE(review): not referenced by any code visible in this chunk — presumably
# used by the commented-out GROBID path or elsewhere in the file; confirm
# before removing.
NAMESPACE = {'tei': 'http://www.tei-c.org/ns/1.0'}

def find_pattern_in_xml(root, pattern):
    """
    Search every text field and attribute value of an XML tree for a regex.

    Walks the tree in document order; for each element, the element text is
    scanned first, then each attribute value.

    :param root: The root Element of the XML tree
    :param pattern: The regex pattern to search for
    :return: A list of matching strings
    """
    compiled = re.compile(pattern)
    hits = []

    # Element.iter() yields root and all descendants in document order,
    # which reproduces the match order of a depth-first recursive walk.
    for element in root.iter():
        if element.text:
            hits += compiled.findall(element.text)
        for value in element.attrib.values():
            hits += compiled.findall(value)

    return hits

def fetch_url(pdf_path):
    """Extract candidate repository URLs from a PDF.

    Combines two strategies:
      1. scraping the page text with pdfplumber and matching a URL regex, and
      2. reading embedded link annotations via PyMuPDF (``get_fitz_urls``).

    The union is de-duplicated, sorted, and filtered down to strings that look
    like repository links (must contain both "/" and "git").

    :param pdf_path: Path to the PDF file.
    :return: Sorted list of unique URL strings containing "/" and "git".
    :raises ValueError: If ``pdf_path`` is None.
    """
    if pdf_path is None:
        raise ValueError("Pdf has no path")

    # One large pattern covering optional doi:/http(s) prefixes, domain names,
    # IPv4 and IPv6 hosts, an optional port, and a trailing path.
    link_pattern = "\\b((?:doi:)?(?:https?://)?(?:(?:www\\.)?(?:[\\da-z\\.-]+)\\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\\w\\.-]*)*/?)\\b"
    url_regex = re.compile(link_pattern)  # compile once, reuse for the whole text

    # Collect per-page text; re-join hyphen/underscore/slash line breaks so
    # URLs wrapped across lines are reassembled before matching. Building a
    # list and joining avoids the quadratic `full_text +=` of the original.
    pieces = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                pieces.append(
                    text.replace("-\n", "-").replace("_\n", "_").replace("/\n", "/").replace("\n", " ")
                )
    full_text = " ".join(pieces) + " "

    urls = url_regex.findall(full_text)

    # Add URLs stored as PDF link annotations (often absent from the text layer).
    urls.extend(get_fitz_urls(pdf_path))

    # sorted(set(...)) reproduces np.unique's sorted de-duplication without the
    # numpy round-trip, and returns plain str instead of np.str_. Keep only
    # repository-looking links.
    return [s for s in sorted(set(urls)) if "/" in s and "git" in s]