SCIgen is a program that generates random Computer Science research papers, including graphs, figures, and citations. It uses a hand-written context-free grammar to form all elements of the papers. Our aim here is to maximize amusement, rather than coherence.
One useful purpose for such a program is to auto-generate submissions to conferences that you suspect might have very low submission standards. A prime example, which you may recognize from spam in your inbox, is SCI/IIIS and its dozens of co-located conferences (check out the very broad conference description on the WMSCI 2005 website). There's also a list of known bogus conferences. Using SCIgen to generate submissions for conferences like this gives us pleasure to no end. In fact, one of our papers was accepted to SCI 2005! See Examples for more details. scigen 是一个自动的论文生成软件,利用上下文无关文法自动生成无意义的英文科学研究论文,内容包含图片、表格、流程图和参考文献等。有的 scigen 生成的论文竟然被权威机构收录!(如 WMSCI 2005)本博文探讨如何对 scigen 生成的伪论文进行判别,即类似于 scigen detection 的工作。
def download_file(url, local_filename):
    """Stream the resource at *url* into *local_filename* and return that path.

    Downloads in 1 KiB chunks so large PDFs never have to fit in memory.

    :param url: absolute URL of the file to fetch
    :param local_filename: destination path on the local filesystem
    :returns: *local_filename*, for caller convenience
    :raises requests.HTTPError: if the server answers with an error status
    """
    r = requests.get(url, stream=True)
    try:
        # Fail fast on 4xx/5xx instead of silently saving an HTML error
        # page with a .pdf name (the original did no status check here).
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    finally:
        # Streaming responses hold the connection open until released.
        r.close()
    return local_filename
# Batch-download 200 freshly generated SCIgen papers (the "fake" corpus).
# Each hit on the redirect CGI yields a page whose second anchor points at
# a newly generated PDF.
savePath = 'j://unrealpdf/'
url = 'http://pdos.csail.mit.edu/cgi-bin/sciredirect.cgi?author=&author=&author=&author=&author='
for i in range(200):  # range works on both Py2 and Py3; 200 items is tiny
    try:
        r = requests.get(url)
        if r.status_code == 200:
            # Name the parser explicitly: bs4 otherwise picks whichever
            # backend is installed, which is nondeterministic and warns.
            soup = BeautifulSoup(r.text, 'html.parser')
            link = soup.find_all('a')[1]  # second anchor is the PDF link
            new_link = 'http://scigen.csail.mit.edu' + link.get('href')
            # Final redirected URL ends in "<name>.xxx"; swap the 3-char
            # extension for "pdf".
            filename = r.url.split('/')[-1][:-4] + 'pdf'
            file_path = download_file(new_link, savePath + filename)
            print(str(i + 1) + ' downloading: ' + filename)
        else:
            print('errors: ' + str(i))
    except Exception as e:
        # Best-effort batch job: report the failure and keep downloading.
        print(e)
def download_file(url, local_filename):
    """Fetch *url* with a streaming GET and write it to *local_filename*.

    The body is copied 1 KiB at a time and flushed after every chunk;
    the destination path is returned unchanged.
    """
    response = requests.get(url, stream=True)
    with open(local_filename, 'wb') as handle:
        for piece in response.iter_content(chunk_size=1024):
            # Skip empty keep-alive chunks emitted by the server.
            if not piece:
                continue
            handle.write(piece)
            handle.flush()
    return local_filename
# Harvest 200 SCIgen papers: every request to the redirect CGI produces a
# brand-new generated paper whose PDF link is the page's second anchor.
savePath = 'j://unrealpdf/'
url = 'http://pdos.csail.mit.edu/cgi-bin/sciredirect.cgi?author=&author=&author=&author=&author='
for attempt in xrange(200):
    try:
        response = requests.get(url)
        if response.status_code != 200:
            print('errors: ' + str(attempt))
        else:
            page = BeautifulSoup(response.text)
            pdf_anchor = page.find_all('a')[1]
            pdf_url = 'http://scigen.csail.mit.edu' + pdf_anchor.get('href')
            # Redirected URL ends ".../<name>.xxx" -> replace extension.
            pdf_name = response.url.split('/')[-1][:-4] + 'pdf'
            file_path = download_file(pdf_url, savePath + pdf_name)
            print(str(attempt + 1) + ' downloading: ' + pdf_name)
    except Exception as e:
        # Keep the batch going on any network/parse hiccup.
        print(e)
def getTestCase(path, isReal):
    """Load a whitespace-separated feature file into parallel lists.

    Each line is "<filename> <feat1> <feat2> ... <trailing-field>"; the
    first token is the file name and every token between the name and the
    final field is parsed as a float (the last field is deliberately
    dropped, matching the file format's trailing column).

    :param path: path of the feature file to read
    :param isReal: label assigned to every sample in this file
    :returns: (fileNames, test_input, labels) with equal lengths
    """
    fileNames = []
    test_input = []
    with open(path, 'r') as f:
        for raw_line in f:
            fields = raw_line.strip().split(' ')
            fileNames.append(fields[0])
            test_input.append([float(v) for v in fields[1:-1]])
    labels = [isReal] * len(test_input)
    return fileNames, test_input, labels
def calFrequent(path, num, isTrain=True):
    """Build a word-frequency table for each paper in directory *path*.

    In training mode only the first *num* directory entries are read;
    otherwise every file is processed.

    NOTE: relies on the module-level ``separator`` regex (defined
    elsewhere in this script) to blank out non-word characters.

    :returns: {filename: {word: count}} for every file read
    """
    frequencies = {}
    dir_files = next(os.walk(path))[-1]  # third element of the walk tuple: file names
    if isTrain:
        dir_files = dir_files[:num]
    for fileName in dir_files:
        with open(path + '/' + fileName, 'r') as handle:
            text = handle.read()
        text = separator.sub(' ', text)
        counts = {}
        for token in text.split(' '):
            if token:  # split on ' ' yields empty strings between runs of spaces
                counts[token] = counts.get(token, 0) + 1
        frequencies[fileName] = counts  # dic{ filename dic{word:wordCnt} }
    return frequencies
def calPaperDistance(dicA, dicB):
    """Asymmetric word-frequency distance between two papers.

    dicB's counts are rescaled by NA/NB so the two papers are compared at
    the same total length. Shared words contribute the absolute rescaled
    difference; words unique to dicA contribute their count only when it
    is at least 2 (``cnt >> 1`` non-zero), while words unique to dicB
    always contribute fully. The sum is normalised by 2 * NA.

    :param dicA: {word: count} for the first paper
    :param dicB: {word: count} for the second paper
    :returns: non-negative float distance
    """
    NA = sum(dicA.values())
    NB = sum(dicB.values())
    rate = NA * 1.0 / NB  # length-normalisation factor for dicB's counts
    dis = 0
    for word, cntA in dicA.items():
        if word in dicB:
            dis += fabs(cntA - dicB[word] * rate)
        elif cntA >> 1:  # unique to A: ignore hapax words (count == 1)
            dis += cntA
    for word, cntB in dicB.items():
        if word not in dicA:
            dis += cntB
    return dis * 1.0 / (NA << 1)
# Assemble the training set: word-frequency tables for the first `trainNum`
# genuine papers and the first `trainNum` SCIgen fakes, merged into one dict.
trainNum = 176
trueFre = calFrequent(truePath, trainNum, isTrain=True)
falseFre = calFrequent(falsePath, trainNum, isTrain=True)
train_data = dict(trueFre, **falseFre)
# File names of every training paper, in the dict's iteration order.
train_fileName = list(train_data.keys())
# Word-frequency tables for every paper awaiting classification (the whole
# directory, not just the first `trainNum` files).
test_data = calFrequent(testPaper, trainNum, isTrain=False)
# BUG FIX: the original iterated train_data here (copy-paste error), so the
# list of "test" file names did not correspond to test_data at all.
test_fileName = list(test_data.keys())
k = 3 for fileName , wordCnt in test_data.items(): distances = [(fileTrain,calPaperDistance(wordCnt, wordCntTrain)) for fileTrain,wordCntTrain in train_data.items()] distances.sort(key=lambda x:x[1]) truePaper = falsePaper = 0 for name , dis in distances[:k]: temp_tag = Falseif name.find('scimakelatex')!=-1elseTrue if temp_tag: truePaper+=1 else: falsePaper+=1 print fileName,Trueif truePaper > falsePaper elseFalse