File size: 7,315 Bytes
3943768
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import os
import argparse
import requests
import json
from semanticscholar import SemanticScholar
import arxiv


def setup_argparse(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what a
            plain ``setup_argparse()`` call has always done.

    Returns:
        argparse.Namespace with the attributes ``query``, ``limit``,
        ``fields``, ``sort``, ``year``, ``author``, ``verbose``, ``download``,
        ``output_dir``, ``output``, ``json``, ``references`` and ``source``.

    Raises:
        SystemExit: Raised by argparse on invalid input, including a
            non-positive ``--limit`` or a ``--year`` range whose START is
            greater than END.
    """
    parser = argparse.ArgumentParser(description="Academic Paper Search Utility")
    parser.add_argument("-q", "--query", type=str, required=True, help="Search query")
    parser.add_argument("-l", "--limit", type=int, default=10, help="Number of results to return")
    parser.add_argument("-f", "--fields", nargs='+',
                        default=['title', 'authors', 'venue', 'year', 'abstract', 'citationCount',
                                 'influentialCitationCount', 'openAccessPdf', 'tldr', 'references', 'externalIds'],
                        help="Fields to include in the results (Semantic Scholar only)")
    parser.add_argument("-s", "--sort", choices=['relevance', 'citations'], default='relevance',
                        help="Sort order for results (Semantic Scholar only)")
    parser.add_argument("-y", "--year", type=int, nargs=2, metavar=('START', 'END'),
                        help="Year range for papers (e.g., -y 2000 2023)")
    parser.add_argument("-a", "--author", type=str, help="Filter by author name")
    parser.add_argument("-v", "--verbose", action="store_true", help="Print full abstracts")
    parser.add_argument("-d", "--download", action="store_true", help="Attempt to download PDFs")
    parser.add_argument("-o", "--output_dir", type=str, default="papers", help="Output directory for downloaded PDFs")
    parser.add_argument("--output", type=str, default="papers", help="Output file name for JSON file")
    parser.add_argument("-j", "--json", action="store_true", help="Output results as JSON")
    parser.add_argument("-r", "--references", type=int, default=0,
                        help="Number of references to include (Semantic Scholar only)")
    parser.add_argument("--source", choices=['semanticscholar', 'arxiv'], default='semanticscholar',
                        help="Choose the source for paper search (default: semanticscholar)")
    args = parser.parse_args(argv)

    # The result loop in main() stops when its 1-based counter equals the
    # limit, so a limit below 1 would never stop it; reject it up front.
    if args.limit < 1:
        parser.error("--limit must be a positive integer")
    # A reversed range would be sent to the search backend as e.g. "2023-2000".
    if args.year and args.year[0] > args.year[1]:
        parser.error("--year START must not be greater than END")
    return args


def search_papers_semanticscholar(sch, args):
    """Run a paper search through the given Semantic Scholar client.

    Args:
        sch: Client object exposing ``search_paper(**kwargs)``.
        args: Parsed CLI namespace; reads ``query``, ``limit``, ``fields``,
            ``sort``, ``year`` and ``author``.

    Returns:
        Whatever ``sch.search_paper`` returns for the assembled keyword
        arguments.
    """
    # Optional filters are only forwarded when the user supplied them.
    optional_filters = {}
    if args.year:
        optional_filters['year'] = f"{args.year[0]}-{args.year[1]}"
    if args.author:
        optional_filters['author'] = args.author

    # NOTE(review): confirm that the installed semanticscholar release accepts
    # an `author` keyword and these `sort` values in search_paper().
    return sch.search_paper(
        query=args.query,
        limit=args.limit,
        fields=args.fields,
        sort=args.sort,
        **optional_filters,
    )


def search_papers_arxiv(args):
    """Query arXiv and return the matching results as a list.

    Only ``args.query`` and ``args.limit`` are read here; results are
    requested sorted by relevance in descending order.
    """
    # NOTE(review): confirm that Search.results() is still supported by the
    # arxiv package version this script is deployed with.
    relevance_search = arxiv.Search(
        query=args.query,
        max_results=args.limit,
        sort_by=arxiv.SortCriterion.Relevance,
        sort_order=arxiv.SortOrder.Descending,
    )
    # Materialise the result iterable so callers get a plain list.
    return [*relevance_search.results()]


def print_paper_info_semanticscholar(paper, index, args):
    """Collect the display fields of a Semantic Scholar paper and print them.

    Args:
        paper: Paper object; reads ``title``, ``authors``, ``venue``, ``year``,
            ``citationCount``, ``influentialCitationCount``, ``externalIds``,
            ``abstract``, ``openAccessPdf`` and, when present, ``tldr`` and
            ``references``.
        index: Position of the paper in the result listing.
        args: Parsed CLI namespace; reads ``verbose`` and ``references`` and
            is forwarded to ``print_info``.
    """
    info = {
        "index": index,
        "title": paper.title,
        "authors": ', '.join([author.name for author in paper.authors]) if paper.authors else 'N/A',
        "venue": paper.venue,
        "year": paper.year,
        "citations": paper.citationCount,
        "influential_citations": paper.influentialCitationCount,
        "externalIds": paper.externalIds,
    }
    if paper.abstract:
        # Abstracts are cut to 200 characters unless --verbose was requested.
        info["abstract"] = paper.abstract if args.verbose else (
            paper.abstract[:200] + "..." if len(paper.abstract) > 200 else paper.abstract)
    if paper.openAccessPdf:
        info["open_access_pdf"] = {
            "url": paper.openAccessPdf['url'],
            "status": paper.openAccessPdf['status']
        }
    if hasattr(paper, 'tldr') and paper.tldr:
        info["tldr"] = paper.tldr.text

    # The attribute may exist but hold None when references were not fetched;
    # slicing None would raise TypeError, so check the value, not just the name.
    references = getattr(paper, 'references', None)
    if args.references > 0 and references is not None:
        info["references"] = [ref.title for ref in references[:args.references]]

    print_info(info, args)


def print_paper_info_arxiv(paper, index, args):
    """Collect the display fields of an arXiv result and print them.

    Args:
        paper: Result object; reads ``title``, ``authors``, ``published``,
            ``summary``, ``entry_id`` and ``pdf_url``.
        index: Position of the paper in the result listing.
        args: Parsed CLI namespace; reads ``verbose`` and is forwarded to
            ``print_info``.
    """
    author_names = [person.name for person in paper.authors]

    # Show the whole summary when --verbose is set or it is short enough;
    # otherwise keep the first 200 characters followed by an ellipsis.
    if args.verbose or len(paper.summary) <= 200:
        abstract_text = paper.summary
    else:
        abstract_text = paper.summary[:200] + "..."

    info = {}
    info["index"] = index
    info["title"] = paper.title
    info["authors"] = ', '.join(author_names)
    info["year"] = paper.published.year
    info["abstract"] = abstract_text
    info["arxiv_url"] = paper.entry_id
    info["pdf_url"] = paper.pdf_url
    print_info(info, args)


def print_info(info, args):
    """Print one paper record, as JSON or as indented text.

    In JSON mode the record is printed to stdout and, when ``args.output`` is
    set, also stored in that file. The file holds a JSON array with one entry
    per record of the current listing, so earlier records are no longer
    overwritten by later ones.

    Args:
        info: Dict describing one paper. The keys ``open_access_pdf`` and
            ``references`` get special text formatting; ``index`` is used to
            detect the first record of a listing.
        args: Parsed CLI namespace; reads ``json`` and ``output``.
    """
    if args.json:
        print(json.dumps(info, indent=2))
        if args.output:
            _store_json_record(info, args.output)
        return

    for key, value in info.items():
        if key == "open_access_pdf":
            print(f"   Open Access PDF: {value['url']} (Status: {value['status']})")
        elif key == "references":
            print(f"   Top {len(value)} References:")
            for ref in value:
                print(f"     - {ref}")
        else:
            print(f"   {key.capitalize()}: {value}")
    print("-" * 50)


def _store_json_record(info, path):
    """Add *info* to the JSON array stored at *path*, reporting write errors."""
    records = []
    # A record with index 1 starts a new listing, so the file is reset;
    # any other record is appended to what is already stored.
    if info.get("index") != 1:
        try:
            with open(path, encoding="utf-8") as f:
                stored = json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable or non-JSON file: start from an empty list.
            stored = []
        records = stored if isinstance(stored, list) else [stored]
    records.append(info)

    try:
        with open(path, 'w', encoding="utf-8") as f:
            json.dump(records, f, indent=2)
    except OSError as e:
        # E.g. the path is a directory; keep listing the remaining papers.
        print(f"   Failed to write JSON output to {path}: {e}")


def download_pdf_semanticscholar(paper, output_dir):
    """Download a paper's open-access PDF into *output_dir*, if it has one.

    The file is named after ``paper.paperId``. When the paper carries no
    open-access PDF entry, or that entry has an empty URL, a notice is
    printed and nothing is downloaded.
    """
    pdf_entry = paper.openAccessPdf
    if not (pdf_entry and pdf_entry['url']):
        print("   No open access PDF available for download")
        return

    download_pdf(pdf_entry['url'], f"{output_dir}/{paper.paperId}.pdf")


def download_pdf_arxiv(paper, output_dir):
    """Download the PDF of an arXiv result into *output_dir*.

    The file is named after the result's short id. Any ``/`` in that id is
    replaced with ``_`` so the name stays a single file inside *output_dir*
    instead of pointing into a sub-directory that does not exist.

    Args:
        paper: Result object providing ``pdf_url`` and ``get_short_id()``.
        output_dir: Directory the PDF is written to.
    """
    pdf_url = paper.pdf_url
    # Old-style identifiers look like "hep-th/9901001v1".
    safe_id = paper.get_short_id().replace("/", "_")
    filename = f"{output_dir}/{safe_id}.pdf"
    download_pdf(pdf_url, filename)


def download_pdf(pdf_url, filename, timeout=30):
    """Fetch *pdf_url* and write the response body to *filename*.

    Failures are reported on stdout instead of being raised, so one bad
    download does not stop the rest of the listing.

    Args:
        pdf_url: URL to download.
        filename: Path of the file to write.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script indefinitely.
    """
    try:
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"   Failed to download PDF: {e}")
        return

    # The file is only opened after the download succeeded, so a failed
    # request never leaves an empty file behind.
    try:
        with open(filename, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        print(f"   Failed to save PDF to {filename}: {e}")
        return
    print(f"   PDF downloaded: {filename}")


def main():
    """Parse the CLI arguments, run the search, then print and download results.

    The chosen ``--source`` selects the search, print and download functions.
    At most ``args.limit`` papers are processed. A closing reminder about
    related helper scripts is always printed at the end.
    """
    args = setup_argparse()

    if args.source != 'semanticscholar':  # arxiv
        papers = search_papers_arxiv(args)
        show_paper, fetch_pdf = print_paper_info_arxiv, download_pdf_arxiv
    else:
        s2_key = os.environ.get("S2_API_KEY")
        if not s2_key:
            print("Warning: S2_API_KEY environment variable not set. Some features may be limited.")
        client = SemanticScholar(api_key=s2_key)
        papers = search_papers_semanticscholar(client, args)
        show_paper, fetch_pdf = print_paper_info_semanticscholar, download_pdf_semanticscholar

    # The text header is skipped in JSON mode.
    if not args.json:
        print(f"Top {args.limit} papers for query '{args.query}' from {args.source}:")
        print("-" * 50)

    # Create the download directory once, before any paper is processed.
    if args.download:
        os.makedirs(args.output_dir, exist_ok=True)

    for position, paper in enumerate(papers, start=1):
        show_paper(paper, position, args)
        if args.download:
            fetch_pdf(paper, args.output_dir)
        # Stop explicitly once the requested number of papers was handled.
        if position == args.limit:
            break

    print("""\n\nRemember to not only use these scientific scholar paper listings,
but also use ask_question_about_documents.py to ask questions about URLs or PDF documents,
ask_question_about_image.py to ask questions about images,
or download_web_video.py to download videos, etc.
A general google or bing search might be advisable if no good results are present here or PDFs of interest are not available.
If you have not found a good response to the user's original query, continue to write executable code to do so.
""")

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()