diff --git a/llmlingua/prompt_compressor.py b/llmlingua/prompt_compressor.py index da5e765..39cb3be 100644 --- a/llmlingua/prompt_compressor.py +++ b/llmlingua/prompt_compressor.py @@ -2161,7 +2161,7 @@ def __get_context_prob( chunk_list.append(c) dataset = TokenClfDataset( - chunk_list, tokenizer=self.tokenizer, max_len=self.max_seq_len + chunk_list, tokenizer=self.tokenizer, max_len=self.max_seq_len, model_name=self.model_name ) dataloader = DataLoader( dataset, batch_size=self.max_batch_size, shuffle=False, drop_last=False @@ -2339,7 +2339,7 @@ def split_string_to_words(input_string): chunk_list.append(c) dataset = TokenClfDataset( - chunk_list, tokenizer=self.tokenizer, max_len=self.max_seq_len + chunk_list, tokenizer=self.tokenizer, max_len=self.max_seq_len, model_name=self.model_name ) dataloader = DataLoader( dataset, batch_size=self.max_batch_size, shuffle=False, drop_last=False diff --git a/tests/test_llmlingua2.py b/tests/test_llmlingua2.py index 25ef5fb..d600319 100644 --- a/tests/test_llmlingua2.py +++ b/tests/test_llmlingua2.py @@ -12,15 +12,15 @@ class LLMLingua2Tester(unittest.TestCase): """ PROMPT = "John: So, um, I've been thinking about the project, you know, and I believe we need to, uh, make some changes. I mean, we want the project to succeed, right? So, like, I think we should consider maybe revising the timeline.\n\nSarah: I totally agree, John. I mean, we have to be realistic, you know. The timeline is, like, too tight. You know what I mean? We should definitely extend it." - COMPRESSED_SINGLE_CONTEXT_PROMPT = "John: thinking project believe need make changes. want project succeed? consider revising timeline.\n\n Sarah agree. be realistic. timeline too tight.? extend." + COMPRESSED_SINGLE_CONTEXT_PROMPT = "John: thinking project need make changes. want project succeed? consider revising timeline.\n\n Sarah agree John. be realistic. timeline too tight.? extend." COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "John: So, I've been thinking about project believe we need to make changes. we want project to succeed, right? think we should consider maybe revising timeline." GSM8K_PROMPT = "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAngelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n\nQuestion: You can buy 4 apples or 1 watermelon for the same price. You bought 36 fruits evenly split between oranges, apples and watermelons, and the price of 1 orange is $0.50. How much does 1 apple cost if your total bill was $66?\nLet's think step by step\nIf 36 fruits were evenly split between 3 types of fruits, then I bought 36/3 = 12 units of each fruit\nIf 1 orange costs $0.50 then 12 oranges will cost $0.50 * 12 = $6\nIf my total bill was $66 and I spent $6 on oranges then I spent $66 - $6 = $60 on the other 2 fruit types.\nAssuming the price of watermelon is W, and knowing that you can buy 4 apples for the same price and that the price of one apple is A, then 1W=4A\nIf we know we bought 12 watermelons and 12 apples for $60, then we know that $60 = 12W + 12A\nKnowing that 1W=4A, then we can convert the above to $60 = 12(4A) + 12A\n$60 = 48A + 12A\n$60 = 60A\nThen we know the price of one apple (A) is $60/60= $1\nThe answer is 1" - GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Question: Angelo Melanie plan test 2 chapters 4 worksheets 3 hours each chapter 1.5 hours each worksheet study 4 hours day how days 10-minute break every 3 10-minute snack breaks 30 minutes lunch\n\n dedicate 3 hours 2 chapters 3 2 = 6 hours total\n worksheets 1.5 hours each worksheet 1.5 4 = 6 hours total\n 12 hours study 4 hours a day 12 / 4 = 3 days\n breaks lunch 10-minute break 12 hours 10 = 120 minutes\n 3 10-minute snack breaks 3 10 = 30 minutes\n 30 minutes lunch 120 + 30 + 30 = 180 minutes 180 / 60 = 3 extra hours\n 12 hours study + 3 hours breaks = 15 hours total\n 4 hours each day 15 / 4 = 3.75\n 4 days\nThe answer is 4" - GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "4 apples 1 watermelon 36 fruits oranges watermelons 1 orange $0.50 1 apple bill $66\n\n 36 fruits 3 36/3 = 12 units\n 1 orange $0.50 12 oranges $0.50 * 12 = $6\n total bill $66 spent $6 oranges $66 - $6 = $60 other 2\n watermelon W 4 apples one apple A 1W=4A\n 12 watermelons 12 apples $60 $60 = 12W + 12A\n $60 = 12(4A + 12A\n = 48A + 12A\n = 60A\n one apple $60/60= $1\nThe answer is 1" + GSM8K_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Question: Angelo Melanie plan test 2 chapters 4 worksheets 3 hours each chapter 1.5 hours each worksheet study 4 hours day how days 10-minute break 3 10-minute snack breaks 30 minutes lunch\n\n dedicate 3 hours 2 chapters 3 2 = 6 hours total\n worksheets 1.5 hours each worksheet 1.5 4 = 6 hours total\n 12 hours study 4 hours a day 12 / 4 = 3 days\n breaks lunch 10-minute break 12 hours 10 = 120 minutes\n 3 10-minute snack breaks 3 10 = 30 minutes\n 30 minutes lunch 120 + 30 + 30 = 180 minutes 180 / 60 = 3 extra hours\n 12 hours study + 3 hours breaks = 15 hours total\n 4 hours each day 15 / 4 = 3.75\n study 4 days\nThe answer is 4" + GSM8K_150TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "4 apples 1 watermelon price 36 fruits watermelons 1 orange $0.50 1 apple bill $66\n\n 36 fruits 3 36/3 = 12 units\n 1 orange $0.50 12 oranges $0.50 * 12 = $6\n total bill $66 spent $6 oranges $66 - $6 = $60 other 2\n watermelon W 4 apples one apple A 1W=4A\n 12 watermelons 12 apples $60 $60 = 12W + 12A\n $60 = 12(4A + 12A\n = 48A + 12A\n = 60A\n one apple $60/60= $1\nThe answer is 1" MEETINGBANK_PROMPT = "Item 28 Report from Development. Services Recommendation to declare ordinance amending the Land Use District Map from institutional to IRP 13 read and adopted as read District eight. Councilman Austin. So moved. Wonderful. And I want to ask Councilman Andrews so any member of the public that wishes to address item 28 saying none, members, cast your vote. Oh, I'm sorry, sir. I did not see you. Can we? I know this sounds picky and stupid. But this is an illogical motion because you haven't yet created ARP 13. By the way, unlike some other speakers, I will furnish you my name. I'm Joe Weinstein. I did speak last week. I do not like to come down here again to talk on the same subjects. But. There is a minor little matter. As to whether a. The proposed zoning is a good idea. And B, whether. The project, which it is intended. To permit. In fact. Meets the specifications of the zoning. I have not check that out, but someone else did raise that question and there may be some question as to whether all of the conditions of that zoning have, in fact, been met by the details of this project. This particular zoning, perhaps in the abstract, need not be a bad idea, but the way you see it realized in the project. Is not a very good idea. You could have the same density and more without destroying the usability, the usable green space that this design does. Because really, although it looks impressive from a top down view, it looks like you see plenty of green space between the buildings, that that space is pretty well wasted and useless because the buildings are high enough to pretty well shade and dominate the green space that's in that project. So I'm not saying that the density that you're going for is a bad thing. But doing it in this way doesn't work, and any zoning that just permits this without further control is not a good idea. Thank you. Okay. Thank you, sir. Members, please cast your vote. Councilman Andrew's motion carries. Next time, please. Report from Development Services recommendation to declare ordinance amending the Land Use District Map from institutional to park red and adopted as Red District eight." - MEETINGBANK_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Item 28 Report Development. Services Recommendation declare ordinance amending Land Use District Map institutional IRP 13 adopted District eight. Councilman Austin. ask Councilman Andrews public address item 28 cast vote. see?. illogical motion created ARP 13. Joe Weinstein. last week. same subjects. minor matter. proposed zoning good idea. project intended. permit Meets specifications zoning. question conditions zoning met details project. zoning not bad project. not good. same density more without destroying usability green space. green space between buildings wasted useless buildings high shade dominate green space. not density bad. doesn't work zoning permits without control not good idea. Thank you. cast vote. Councilman Andrew's motion carries. Next time.Development Services ordinance Land District Map park District." + MEETINGBANK_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT = "Item 28 Report Development. Services declare ordinance amending Land Use District Map institutional IRP 13 adopted District eight. Councilman Austin. moved ask Councilman Andrews public address item 28 cast vote. see.?. illogical motion't created ARP 13. Joe Weinstein. last week. same subjects. minor matter. proposed zoning good idea. project. Meets specifications zoning. question conditions zoning met details project. zoning not bad idea project. not good idea. same density more without destroying usability green space design. green space between buildings wasted useless buildings shade dominate green space. not density bad. doesn't work zoning permits without control not good idea. Thank you. cast vote. Councilman Andrew's motion carries. Next time.Development Services ordinance Land Use District District eight." LONGBENCH_PROMPT_LIST = [ "新闻内容:\n(服务·健康)专家提醒:寒冷气候易诱发心脑血管疾病\n新华社海口2月9日专电(张苏民、李建国)海口市疾病预防控制中心专家介绍,持续的寒冷气候是心脑血管疾病的杀手,尤其患有高血压或高血脂疾病的老人更应做好防范,防止脑中风发生。\n  在寒冷的气候环境当中要注意保暖,增添衣服,饮食以清淡为主,多食用蔬菜,忌暴食荤类。尤其过年时,切忌熬夜,平时要加强身体锻炼,劳逸结合。除此之外,冬季还是呼吸道传染病暴发和流行的季节,应该注意预防流感、麻疹、流脑、水痘等呼吸道传染病的发生。\n  专家还指出,由于寒冷气候影响,人们习惯门窗紧闭,空气不对流,一旦有传染源传入,极容易造成疾病的暴发。春节期间,一些商场或公共娱乐场所人群密集,有关单位应加强通风。(完)\n类别:医药、卫生", @@ -36,10 +36,11 @@ class LLMLingua2Tester(unittest.TestCase): "\n\n新闻内容:\n在国防教育的落实上下功夫\n在国防教育的落实上下功夫 赵荣\n 加强全民国防教育是增强国防观念和忧患意识、促进国防和军队建设的基础性工程。鉴此,在今后的实践中,要坚持以科学发展观为指导,科学谋划、创新形式、狠抓落实,使全民国防教育深入人心,扎实有效地开展下去。\n 抓好责任落实。《国防教育法》第三章第十八条规定:各地区各部门的领导人员应当依法履行组织、领导本地区、本部门开展国防教育的职责。因而,要使全民国防教育扎实有效地开展下去,各级领导和职能部门要依法负起抓好全民国防教育的责任,对本地区、本单位、本行业的国防教育,从计划安排到组织实施都要认真负责地抓好落实。\n 抓好人员落实。国防教育是面向全民的教育,它的开展必须面向全社会,而不能只针对个别地区、个别单位和个别人员。因而,各地要对一切有接受能力的公民实施国防教育,以提高全民的政治、思想和道德素质,使全体公民积极争当热爱祖国、热爱国防的好公民。\n 抓好效果落实。国防教育的开展,效果的落实极为重要。为此,教育中应着重抓好国防理论、国防精神、国防知识、国防历史、国防技能、国防法制的教育,以强化爱国精神、增长国防知识、强化国防观念。通过教育,使全体公民进一步了解我国安全面临的新形势、世界军事变革的新发展、我国国防和军队建设面临的新挑战、以及在对国防建设中应承担的义务和责任等,不断提高他们支持和关心国防建设的积极性和自觉性。\n (来源:中国国防报 发布时间: 2007-11-22 08:19)\n类别:军事", "\n\n新闻内容:\n中国又一学者当选瑞典皇家工程科学院外籍院士\n新华社北京8月20日电 北京航空航天大学中国循环经济研究中心主任、北京循环经济促进会会长吴季松教授,日前被瑞典皇家工程科学院全体大会选为该院外籍院士。\n  作为改革开放后首批出国访问学者之一,吴季松曾在欧洲原子能联营法国原子能委员会研究受控热核聚变,还曾任中国常驻联合国教科文组织代表团参赞衔副代表、联合国教科文组织科技部门高技术与环境顾问。1985至1986年,主持联合国教科文组织“多学科综合研究应用于经济发展”专题研究,并由联合国教科文组织发表项目研究报告创意知识经济。\n 他在中国科技和产业领域作出了多项贡献,主要包括:创意“知识经济”并将科技园区的实践介绍到中国、提出修复生态系统理论并主持制定水资源规划、创立新循环经济学等。\n  瑞典皇家工程科学院创建于1919年,是世界上第一个工程院,现有机械工程、电机工程等学部。该院参与相关诺贝尔奖项的提名和评审工作。目前共有院士(含外籍院士)近1100人,来自中国的外籍院士包括宋健、徐冠华等。(完)\n类别:科学技术", ] - LONGBENCH_1000TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "\n 新闻内容 第38届世界贸易中心年会及经贸洽谈会\n 安那州首府新奥尔良召开。\n 易服务管理总局、新奥尔良世贸中心共同举办\n 家和地区的经贸代表团约600余人与会。 天津贸促会与天津世贸中心协\n 会将共同组织天津经贸代表团赴美国参加“世贸中心2007年年会及经贸\n 洽谈会”。\n 联系人:王岭 刘鹏\n 电话:022-2520231725202123\n 传真:022-25201975\n 地址:天津经济 技术开发区宏达街19号A区2楼\n类别:商业、外贸、海关\n\n\n 新闻内容\n 海口“接管”省 特殊教育 学校\n 创建于1989年的海南省特殊教育 学校原属省教育 厅直属正处级事业单位,为海南省惟一一所全日寄宿的公立特殊教育 学校。\n教育 学校之后,将继续面向全省招收视障、听障两类适龄儿童教育 布局调整教育。\n类别:教育\n\n\n 中国又一学者当选瑞典皇家工程科学院外籍院士\n 新华社北京8月20日电 北京航空航天大学中国循环经济 研究中心主任、北京循环经济 促进会会长吴季松教授,日前被瑞典皇家工程科学院全体大会选为该院外籍院士。\n 作为改革开放后首批出国访问学者之一,吴季松曾在欧洲原子能联营法国原子能委员会研究受控热核聚变,还曾任中国常驻联合国教科文组织代表团参赞衔副代表、联合国教科文组织科技部门高技术与环境顾问。 1985至1986年,主持联合国教科文组织“多学科综合研究应用于经济 发展”专题研究经济。\n:创意“知识经济 ”并将科技园区的实践介绍到中国、提出修复生态系统理论并主持制定水资源规划、创立新循环经济 学等。\n 瑞典皇家工程科学院创建于1919年,是世界上第一个工程院,现有机械工程、电机工程等学部。 目前共有院士(含外籍院士)近1100人,来自中国的外籍院士包括宋健、徐冠华等。\n类别:科学技术" + LONGBENCH_1000TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT = "\n 新闻内容 第38届世界贸易中心年会及经贸洽谈会\n 安那州首府新奥尔良召开。\n 易服务管理总局、新奥尔良世贸中心共同举办\n 家和地区的经贸代表团约600余人与会。 天津贸促会与天津世贸中心协\n 会将共同组织天津经贸代表团赴美国参加“世贸中心2007年年会及经贸\n 洽谈会”。\n 联系人:王岭 刘鹏\n 电话:022-2520231725202123\n 传真:022-25201975\n 地址:天津经济 技术开发区宏达街19号A区2楼\n类别:商业、外贸、海关\n\n\n 新闻内容\n 海口“接管”省 特殊教育 学校\n教育 学校原属省教育 厅直属正处级事业单位,为海南省惟一一所全日寄宿的公立特殊教育 学校。\n 我市“接管”省特殊教育 学校之后,将继续面向全省招收视障、听障两类适龄儿童教育。\n类别:教育\n\n\n 中国又一学者当选瑞典皇家工程科学院外籍院士\n 新华社北京8月20日电 北京航空航天大学中国循环经济 研究中心主任、北京循环经济 促进会会长吴季松教授,日前被瑞典皇家工程科学院全体大会选为该院外籍院士。\n 作为改革开放后首批出国访问学者之一,吴季松曾在欧洲原子能联营法国原子能委员会研究受控热核聚变,还曾任中国常驻联合国教科文组织代表团参赞衔副代表、联合国教科文组织科技部门高技术与环境顾问。 1985至1986年,主持联合国教科文组织“多学科综合研究应用于经济 发展”专题研究经济。\n:创意“知识经济 ”并将科技园区的实践介绍到中国、提出修复生态系统理论并主持制定水资源规划、创立新循环经济 学等。\n 瑞典皇家工程科学院创建于1919年,现有机械工程、电机工程等学部。 目前共有院士(含外籍院士)近1100人,来自中国的外籍院士包括宋健、徐冠华等。 (完)\n类别:科学技术" def __init__(self, *args, **kwargs): super(LLMLingua2Tester, self).__init__(*args, **kwargs) + self.maxDiff = None self.llmlingua = PromptCompressor( model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank", device_map="cpu", @@ -121,9 +122,9 @@ def test_general_compress_prompt(self): self.MEETINGBANK_150TOKENS_COMPRESSED_SINGLE_CONTEXT_PROMPT, ) self.assertEqual(compressed_prompt["origin_tokens"], 464) - self.assertEqual(compressed_prompt["compressed_tokens"], 154) + self.assertEqual(compressed_prompt["compressed_tokens"], 156) self.assertEqual(compressed_prompt["ratio"], "3.0x") - self.assertEqual(compressed_prompt["rate"], "33.2%") + self.assertEqual(compressed_prompt["rate"], "33.6%") # Multiple Context compressed_prompt = self.llmlingua.compress_prompt( @@ -187,6 +188,6 @@ def test_general_compress_prompt(self): self.LONGBENCH_1000TOKENS_COMPRESSED_MULTIPLE_CONTEXT_PROMPT, ) self.assertEqual(compressed_prompt["origin_tokens"], 8389) - self.assertEqual(compressed_prompt["compressed_tokens"], 870) - self.assertEqual(compressed_prompt["ratio"], "9.6x") - self.assertEqual(compressed_prompt["rate"], "10.4%") + self.assertEqual(compressed_prompt["compressed_tokens"], 851) + self.assertEqual(compressed_prompt["ratio"], "9.9x") + self.assertEqual(compressed_prompt["rate"], "10.1%")