Publications

2017

  • J. Escobar-Avila, E. Parra, and S. Haiduc, “Text Retrieval-based Tagging of Software Engineering Video Tutorials,” in Proceedings of the 39th ACM/IEEE International Conference on Software Engineering (ICSE’17), Buenos Aires, Argentina, 2017, p. To appear.
    [Bibtex]
    @inproceedings{Escobar-Avila2017,
      author    = {Escobar-Avila, Javier and Parra, Esteban and Haiduc, Sonia},
      title     = {Text {Retrieval-based} {Tagging} of {Software} {Engineering} {Video} {Tutorials}},
      booktitle = {Proceedings of the 39th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'17)},
      year      = {2017},
      month     = may,
      address   = {Buenos Aires, Argentina},
      publisher = {ACM},
      note      = {To appear},
      abstract  = {Video tutorials are an emerging form of documentation in software engineering and can efficiently provide
      developers with useful information needed for their daily tasks.
      However, to get the information they need, developers have to find the right tutorial for their task at hand. Currently, there is little information available to quickly judge whether a tutorial
      is relevant to a topic or helpful for task at hand, which can
      lead to missing the best tutorials and wasting time watching
      irrelevant ones. We present the first efforts towards new tagging
      approaches using text retrieval that describe the contents of
      software engineering video tutorials, making it easier and faster
      to understand their purpose and contents. We also present the
      results of a preliminary evaluation of thirteen such approaches,
      revealing the potential of some and limitations of others. Our
      future work will focus on improving on the promising approaches
      determined in this preliminary study and supplementing them
      with additional information.},
      keywords  = {Software engineering; video tutorials; tagging; text retrieval},
      timestamp = {2017.02.12},
    }
  • C. Mills and S. Haiduc, “A Machine Learning Approach for Determining the Validity of Traceability Links,” in Proceedings of the 39th ACM/IEEE International Conference on Software Engineering (ICSE’17), Buenos Aires, Argentina, 2017, p. To appear.
    [Bibtex]
    @inproceedings{Mills2017a,
      author    = {Mills, Chris and Haiduc, Sonia},
      title     = {A {Machine} {Learning} {Approach} for {Determining} the {Validity} of {Traceability} {Links}},
      booktitle = {Proceedings of the 39th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'17)},
      year      = {2017},
      month     = may,
      address   = {Buenos Aires, Argentina},
      publisher = {ACM},
      note      = {To appear},
      timestamp = {2017.02.12},
    }
  • C. Mills and S. Haiduc, “The Impact of Retrieval Direction on IR-based Traceability Link Recovery,” in Proceedings of the 39th ACM/IEEE International Conference on Software Engineering (ICSE’17), Buenos Aires, Argentina, 2017, p. To appear.
    [Bibtex]
    @inproceedings{Mills2017b,
      author    = {Mills, Chris and Haiduc, Sonia},
      title     = {The {Impact} of {Retrieval} {Direction} on {IR}-based {Traceability} {Link} {Recovery}},
      booktitle = {Proceedings of the 39th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'17)},
      year      = {2017},
      month     = may,
      address   = {Buenos Aires, Argentina},
      publisher = {ACM},
      note      = {To appear},
      timestamp = {2017.02.12},
    }

2016

  • [DOI] S. Haiduc, V. Arnaoudova, A. Marcus, and G. Antoniol, “The Use of Text Retrieval and Natural Language Processing in Software Engineering,” in Proceedings of the 38th ACM/IEEE International Conference on Software Engineering (ICSE’16), Austin, TX, USA, 2016, pp. 898-899.
    [Bibtex]
    @inproceedings{Haiduc2016,
      author    = {Haiduc, Sonia and Arnaoudova, Venera and Marcus, Andrian and Antoniol, G.},
      title     = {The {Use} of {Text} {Retrieval} and {Natural} {Language} {Processing} in {Software} {Engineering}},
      booktitle = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16)},
      year      = {2016},
      address   = {Austin, TX, USA},
      pages     = {898--899},
      publisher = {ACM},
      abstract  = {This technical briefing presents the state of the art Text Retrieval and Natural Language Processing techniques used in Software Engineering and discusses their applications in the field.},
      doi       = {10.1145/2889160.2891053},
      owner     = {USER},
      timestamp = {2016.05.05},
    }
  • [DOI] S. Imminni, M. Hasan, M. Duckett, P. Sachdeva, S. Karmakar, P. Kumar, and S. Haiduc, “Spyse: a Semantic Search Engine for Python Packages and Modules,” in Proceedings of the 38th ACM/IEEE International Conference on Software Engineering (ICSE’16), Austin, TX, USA, 2016, pp. 625-628.
    [Bibtex]
    @inproceedings{Imminni2016,
      author    = {Imminni, S. and Hasan, M. and Duckett, M. and Sachdeva, P. and Karmakar, S. and Kumar, P. and Haiduc, Sonia},
      title     = {SPYSE: A {Semantic} {Search} {Engine} for {Python} {Packages} and {Modules}},
      booktitle = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16)},
      year      = {2016},
      address   = {Austin, TX, USA},
      pages     = {625--628},
      publisher = {ACM},
      abstract  = {Code reuse is a common practice among software developers, whether novices or experts. Developers often rely on online resources in order to find code to reuse. For Python, the Python Package Index (PyPI) contains all packages developed for the community and is the largest catalog of reusable, open source packages developers can consult. While a valuable resource, the state of the art PyPI search has very limited capabilities, making it hard for developers to find useful, high quality Python code to use for their task at hand.
      We introduce SPYSE (Semantic PYthon Search Engine), a web-based search engine that overcomes the limitations of the state of the art, making it easier for developers to find useful code. The power of SPYSE lays in the combination of three different aspects meant to provide developers with relevant, and at the same time high quality code: code semantics, popularity, and code quality. SPYSE also allows searching for modules, in addition to packages, which opens new reuse opportunities for developers, currently not supported. TOOL URL: https://pypi.compgeom.com VIDEO URL: https://youtu.be/Praglw-vS50},
      doi       = {10.1145/2889160.2889174},
      owner     = {USER},
      timestamp = {2016.05.05},
    }
  • [DOI] E. Parra, S. Haiduc, and R. James, “Making a Difference: An Overview of Humanitarian Free Open Source Systems,” in Proceedings of the 38th ACM/IEEE International Conference on Software Engineering (ICSE’16), Austin, TX, USA, 2016, pp. 731-733.
    [Bibtex]
    @inproceedings{Parra2016,
      author    = {Parra, Esteban and Haiduc, Sonia and James, Rebecca},
      title     = {Making a {Difference}: {An} {Overview} of {Humanitarian} {Free} {Open} {Source} {Systems}},
      booktitle = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16)},
      year      = {2016},
      address   = {Austin, TX, USA},
      pages     = {731--733},
      publisher = {ACM},
      abstract  = {Humanitarian Free Open Source Software (HFOSS) serves philanthropic goals that usually benefit non-profit organizations meant to improve the human condition. The altruistic goals these systems serve can offer developers additional motivation for contributing to OSS and have been seen as a way to attract more women to computing majors and to improve students' learning.
      We present an exploratory study of the currently existing HFOSS projects, aimed at giving an overview of their properties, including the most common application domains and the most popular programming languages used in this kind of systems. We also investigated the assumption that HFOSS systems attract more women developers and found the assumption to be incorrect.},
      doi       = {10.1145/2889160.2892651},
      owner     = {USER},
      timestamp = {2016.05.05},
    }
  • [DOI] L. Ponzanelli, G. Bavota, A. Mocci, M. Di Penta, R. Oliveto, B. Russo, S. Haiduc, and M. Lanza, “CodeTube: Extracting Relevant Fragments from Software Development Video Tutorials,” in Proceedings of the 38th ACM/IEEE International Conference on Software Engineering (ICSE’16), Austin, TX, USA, 2016, pp. 645-648.
    [Bibtex]
    @inproceedings{Ponzanelli2016,
      author    = {Ponzanelli, L. and Bavota, G. and Mocci, A. and Di Penta, M. and Oliveto, R. and Russo, B. and Haiduc, Sonia and Lanza, M.},
      title     = {{CodeTube}: {Extracting} {Relevant} {Fragments} from {Software} {Development} {Video} {Tutorials}},
      booktitle = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16)},
      year      = {2016},
      address   = {Austin, TX, USA},
      pages     = {645--648},
      publisher = {ACM},
      abstract  = {Nowadays developers heavily rely on sources of informal documentation, including Q\&A forums, slides, or video tutorials, the latter being particularly useful to provide introductory notions for a piece of technology. The current practice is that developers have to browse sources individually, which in the case of video tutorials is cumbersome, as they are lengthy and cannot be searched based on their contents.
      We present CodeTube, a Web-based recommender system that analyzes the contents of video tutorials and is able to provide, given a query, cohesive and self-contained video fragments, along with links to relevant Stack Overflow discussions. CodeTube relies on a combination of textual analysis and image processing applied on video tutorial frames and speech transcripts to split videos into cohesive fragments, index them and identify related Stack Overflow discussions.
      DEMO URL: http://codetube.inf.usi.ch
      VIDEO URL: https://youtu.be/yUsUG3g87Dg},
      doi       = {10.1145/2889160.2889172},
      owner     = {USER},
      timestamp = {2016.05.05},
    }
  • [DOI] L. Ponzanelli, G. Bavota, A. Mocci, M. Di Penta, R. Oliveto, M. Hasan, B. Russo, S. Haiduc, and M. Lanza, “Too Long; Didn’t Watch! Extracting Relevant Fragments from Software Development Video Tutorials,” in Proceedings of the 38th ACM/IEEE International Conference on Software Engineering (ICSE’16), Austin, TX, USA, 2016, pp. 261-272.
    [Bibtex]
    @inproceedings{Ponzanelli2016b,
      author    = {Ponzanelli, L. and Bavota, G. and Mocci, A. and Di Penta, M. and Oliveto, R. and Hasan, M. and Russo, B. and Haiduc, Sonia and Lanza, M.},
      title     = {Too {Long}; {Didn't} {Watch}! {Extracting} {Relevant} {Fragments} from {Software} {Development} {Video} {Tutorials}},
      booktitle = {Proceedings of the 38th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'16)},
      year      = {2016},
      address   = {Austin, TX, USA},
      pages     = {261--272},
      publisher = {ACM},
      abstract  = {When knowledgeable colleagues are not available, developers resort to offline and online resources, e.g., tutorials, mailing lists, and Q\&A websites. These, however, need to be found, read, and understood, which takes its toll in terms of time and mental energy. A more immediate and accessible resource are video tutorials found on the web, which in recent years have seen a steep increase in popularity. Nonetheless, videos are an intrinsically noisy data source, and finding the right piece of information might be even more cumbersome than using the previously mentioned resources.
      We present CodeTube, an approach which mines video tutorials found on the web, and enables developers to query their contents. The video tutorials are split into coherent fragments, to return only fragments related to the query. These are complemented with information from additional sources, such as Stack Overflow discussions. The results of two studies to assess CodeTube indicate that video tutorials---if appropriately processed---represent a useful, yet still under-utilized source of information for software development.},
      doi       = {10.1145/2884781.2884824},
      owner     = {USER},
      timestamp = {2016.05.05},
    }

2015

  • V. Arnaoudova, S. Haiduc, A. Marcus, and G. Antoniol, “The Use of Text Retrieval and Natural Language Processing in Software Engineering,” in ACM Conference on Systems, Programming, Languages and Applications: Software for Humanity (SPLASH’15), Pittsburgh, PA, USA, 2015.
    [Bibtex]
    @inproceedings{Arnaoudova2015,
      author    = {Arnaoudova, V. and Haiduc, S. and Marcus, A. and Antoniol, G.},
      title     = {The {Use} of {Text} {Retrieval} and {Natural} {Language} {Processing} in {Software} {Engineering}},
      booktitle = {{ACM} {Conference} on {Systems}, {Programming}, {Languages} and {Applications}: {Software} for {Humanity} ({SPLASH}'15)},
      year      = {2015},
      month     = oct,
      address   = {Pittsburgh, PA, USA},
      keywords  = {information retrieval, Software engineering},
      owner     = {USER},
      timestamp = {2015.09.18},
    }
  • V. Arnaoudova, S. Haiduc, A. Marcus, and G. Antoniol, “The Use of Text Retrieval and Natural Language Processing in Software Engineering,” in Proceedings of the 10th Joint Meeting of the European Software Engineering Conference and the ACM/SIGSOFT Symposium on the Foundations of Software Engineering (ESEC/FSE’15), Bergamo, Italy, 2015.
    [Bibtex]
    @inproceedings{Arnaoudova2015a,
      author    = {Arnaoudova, V. and Haiduc, S. and Marcus, A. and Antoniol, G.},
      title     = {The {Use} of {Text} {Retrieval} and {Natural} {Language} {Processing} in {Software} {Engineering}},
      booktitle = {Proceedings of the 10th {Joint} {Meeting} of the {European} {Software} {Engineering} {Conference} and the {ACM/SIGSOFT} {Symposium} on the {Foundations} of {Software} {Engineering} ({ESEC/FSE}'15)},
      year      = {2015},
      month     = oct,
      address   = {Bergamo, Italy},
      keywords  = {information retrieval, Software engineering},
      owner     = {USER},
      timestamp = {2015.09.18},
    }
  • [DOI] V. Arnaoudova, S. Haiduc, A. Marcus, and G. Antoniol, “The Use of Text Retrieval and Natural Language Processing in Software Engineering,” in Proceedings of the 37th ACM/IEEE International Conference on Software Engineering, Florence, Italy, 2015, pp. 949-950.
    [Bibtex]
    @inproceedings{Arnaoudova2015b,
      author    = {Arnaoudova, V. and Haiduc, S. and Marcus, A. and Antoniol, G.},
      title     = {The {Use} of {Text} {Retrieval} and {Natural} {Language} {Processing} in {Software} {Engineering}},
      booktitle = {Proceedings of the 37th {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering}},
      year      = {2015},
      month     = may,
      address   = {Florence, Italy},
      pages     = {949--950},
      doi       = {10.1109/ICSE.2015.301},
      keywords  = {information retrieval, Software engineering},
      owner     = {USER},
      timestamp = {2015.09.18},
    }
  • J. Escobar-Avila, “Automatic Categorization of Software Libraries Using Bytecode,” in Proceedings of the 37th IEEE/ACM International Conference on Software Engineering (ICSE’15), Florence, Italy, 2015, pp. 784-786.
    [Bibtex]
    @inproceedings{Escobar-Avila2015a,
      author    = {Escobar-Avila, Javier},
      title     = {Automatic {Categorization} of {Software} {Libraries} {Using} {Bytecode}},
      booktitle = {Proceedings of the 37th {IEEE}/{ACM} {International} {Conference} on {Software} {Engineering} ({ICSE}'15)},
      year      = {2015},
      address   = {Florence, Italy},
      pages     = {784--786},
      publisher = {IEEE},
      series    = {ICSE '15},
      abstract  = {Automatic software categorization is the task of assigning categories or tags to software libraries in order to summarize their functionality. Correctly assigning these categories is essential to ensure that relevant libraries can be easily retrieved by developers from large repositories. Current categorization approaches rely on the semantics reflected in the source code, or use supervised machine learning techniques, which require a set of labeled software as a training data. These approaches fail when such information is not available. We propose a novel unsupervised approach for the automatic categorization of Java libraries, which uses the bytecode of a library in order to determine its category. We show that the approach is able to successfully categorize libraries from the Apache Foundation Repository.},
      acmid     = {2819167},
      location  = {Florence, Italy},
      numpages  = {3},
      owner     = {USER},
      timestamp = {2016.09.27},
      url       = {http://dl.acm.org/citation.cfm?id=2819009.2819167},
    }
  • [DOI] J. Escobar-Avila, M. Linares-Vasquez, and S. Haiduc, “Unsupervised Software Categorization Using Bytecode,” in Proceedings of the 23rd IEEE International Conference on Program Comprehension (ICPC), Florence, Italy, 2015, pp. 229-239.
    [Bibtex]
    @inproceedings{Escobar-Avila2015b,
      author    = {Escobar-Avila, Javier and Linares-Vasquez, Mario and Haiduc, Sonia},
      title     = {Unsupervised {Software} {Categorization} {Using} {Bytecode}},
      booktitle = {Proceedings of the 23rd {IEEE} {International} {Conference} on {Program} {Comprehension} ({ICPC})},
      year      = {2015},
      month     = may,
      address   = {Florence, Italy},
      pages     = {229--239},
      doi       = {10.1109/ICPC.2015.33},
      keywords  = {Accuracy, bytecode, clustering, Clustering algorithms, Data mining, dirichlet process, Java, Software, software categorization, Software libraries, software profiles},
      owner     = {USER},
      timestamp = {2016.08.31},
    }
  • S. Haiduc and G. Bavota, “5th Workshop on Mining Unstructured Data,” in Proceedings of the 31st International Conference on Software Maintenance and Evolution (ICSME’15), Bremen, Germany, 2015, pp. 1-6.
    [Bibtex]
    @inproceedings{Haiduc2015,
      author    = {Haiduc, S. and Bavota, G.},
      title     = {5th {Workshop} on {Mining} {Unstructured} {Data}},
      booktitle = {Proceedings of the 31st {International} {Conference} on {Software} {Maintenance} and {Evolution} ({ICSME}'15)},
      year      = {2015},
      address   = {Bremen, Germany},
      pages     = {1--6},
      keywords  = {Conferences, Data mining, Educational institutions, information retrieval, Multiuser detection, Software, Software engineering},
      owner     = {USER},
      timestamp = {2015.09.18},
    }
  • [DOI] L. Moreno, G. Bavota, S. Haiduc, M. Di Penta, R. Oliveto, B. Russo, and A. Marcus, “Query-based Configuration of Text Retrieval Solutions for Software Engineering Tasks,” in Proceedings of the 10th Joint Meeting of the European Software Engineering Conference and the ACM/SIGSOFT Symposium on the Foundations of Software Engineering (ESEC/FSE’15), Bergamo, Italy, 2015, pp. 567-578.
    [Bibtex]
    @inproceedings{Moreno2015,
      author    = {Moreno, Laura and Bavota, Gabriele and Haiduc, Sonia and Di Penta, Massimiliano and Oliveto, Rocco and Russo, Barbara and Marcus, Andrian},
      title     = {Query-based {Configuration} of {Text} {Retrieval} {Solutions} for {Software} {Engineering} {Tasks}},
      booktitle = {Proceedings of the 10th {Joint} {Meeting} of the {European} {Software} {Engineering} {Conference} and the {ACM/SIGSOFT} {Symposium} on the {Foundations} of {Software} {Engineering} ({ESEC/FSE}'15)},
      year      = {2015},
      address   = {Bergamo, Italy},
      pages     = {567--578},
      publisher = {ACM},
      series    = {{ESEC}/{FSE} 2015},
      abstract  = {Text Retrieval (TR) approaches have been used to leverage the textual information contained in software artifacts to address a multitude of software engineering (SE) tasks. However, TR approaches need to be configured properly in order to lead to good results. Current approaches for automatic TR configuration in SE configure a single TR approach and then use it for all possible queries. In this paper, we show that such a configuration strategy leads to suboptimal results, and propose QUEST, the first approach bringing TR configuration selection to the query level. QUEST recommends the best TR configuration for a given query, based on a supervised learning approach that determines the TR configuration that performs the best for each query according to its properties. We evaluated QUEST in the context of feature and bug localization, using a data set with more than 1,000 queries. We found that QUEST is able to recommend one of the top three TR configurations for a query with a 69\% accuracy, on average. We compared the results obtained with the configurations recommended by QUEST for every query with those obtained using a single TR configuration for all queries in a system and in the entire data set. We found that using QUEST we obtain better results than with any of the considered TR configurations.},
      doi       = {10.1145/2786805.2786859},
      isbn      = {978-1-4503-3675-8},
      keywords  = {configuration, Feature and Bug Localization, Text-Retrieval in Software Engineering},
      owner     = {USER},
      timestamp = {2016.08.31},
    }

2014

  • [DOI] S. Haiduc, “Supporting Query Formulation for Text Retrieval Applications in Software Engineering,” in Proceedings of the 30th IEEE International Conference on Software Maintenance and Evolution (ICSME’14), Victoria, Canada, 2014, pp. 657-662.
    [Bibtex]
    @inproceedings{Haiduc2014,
      author    = {Haiduc, S.},
      title     = {Supporting {Query} {Formulation} for {Text} {Retrieval} {Applications} in {Software} {Engineering}},
      booktitle = {Proceedings of the 30th {IEEE} {International} {Conference} on {Software} {Maintenance} and {Evolution} ({ICSME}'14)},
      year      = {2014},
      month     = sep,
      address   = {Victoria, Canada},
      pages     = {657--662},
      abstract  = {Text Retrieval (TR) techniques have been successfully used to leverage the textual information found in software artifacts with the purpose of aiding developers with their daily tasks. TR techniques require a query as input and the usefulness of the results they retrieve depends greatly on this query. While some queries retrieve relevant information for the current task, others do not, therefore pointing developers in the wrong direction. Developers have a hard time realizing this before going through the search results, which, in the case of "bad" queries means time and effort lost looking at irrelevant information. In this scenario, developers have to reformulate the query, often without pointers on how to improve it. The work presented in this paper introduces novel approaches to address these challenges and makes two main contributions: 1) defines the first approach for predicting the success of a TR query in the context of SE tasks, 2) introduces automatic approaches that analyze a query and improve it by finding the most suited reformulation for it. The approaches were evaluated for the task of concept location in source code and the results of the performed studies reveal their usefulness.},
      doi       = {10.1109/ICSME.2014.117},
      keywords  = {Conferences, query, query formulation, Query Quality, Query Reformulation, relevant information retrieval, software artifacts, Software engineering, software maintenance, source code, text analysis, text retrieval, text retrieval applications, textual information, TR query prediction, TR techniques},
      owner     = {USER},
      timestamp = {2016.08.31},
    }

2013

  • [DOI] A. Bacchelli, N. Bettenburg, L. Guerrouj, and S. Haiduc, “3rd Workshop on Mining Unstructured Data,” in Proceedings of the 20th Working Conference on Reverse Engineering (WCRE’13), Koblenz, Germany, 2013, pp. 491-492.
    [Bibtex]
    @inproceedings{Bacchelli2013,
      author    = {Bacchelli, Alberto and Bettenburg, Nicolas and Guerrouj, Latifa and Haiduc, Sonia},
      title     = {3rd {Workshop} on {Mining} {Unstructured} {Data}},
      booktitle = {Proceedings of the 20th {Working} {Conference} on {Reverse} {Engineering} ({WCRE}'13)},
      year      = {2013},
      month     = oct,
      address   = {Koblenz, Germany},
      pages     = {491--492},
      abstract  = {Software development knowledge resides in the source code and in a number of other artefacts produced during the development process. To extract such a knowledge, past software engineering research has extensively focused on mining the source code, i.e., the final product of the development effort. Currently, we witness an emerging trend where researchers strive to exploit the information captured in artifacts such as emails and bug reports, free-form text requirements and specifications, comments and identifiers. Being often expressed in natural language, and not having a well-defined structure, the information stored in these artifacts is defined as unstructured data. Although research communities in Information Retrieval, Data Mining and Natural Language Processing have devised techniques to deal with unstructured data, these techniques are usually limited in scope (i.e., designed for English language text found in newspaper articles) and intended for use in specific scenarios, thus failing to achieve their full potential in a software development context. The workshop on Mining Unstructured Data (MUD) aims to provide a common venue for researchers and practitioners across software engineering, information retrieval and data mining research domains, to share new approaches and emerging results in mining unstructured data.},
      doi       = {10.1109/WCRE.2013.6671333},
      keywords  = {Conferences, Data mining, Educational institutions, information retrieval, Multiuser detection, Software, Software engineering},
      owner     = {USER},
      timestamp = {2016.08.31},
    }
  • [DOI] S. Haiduc, G. Bavota, A. Marcus, R. Oliveto, A. De Lucia, and T. Menzies, “Automatic Query Reformulations for Text Retrieval in Software Engineering,” in Proceedings of the 35th International Conference on Software Engineering (ICSE’13), San Francisco, CA, USA, 2013, pp. 842-851.
    [Bibtex]
    @inproceedings{Haiduc2013,
      author    = {Haiduc, S. and Bavota, G. and Marcus, A. and Oliveto, R. and De Lucia, A. and Menzies, T.},
      title     = {{Automatic} {Query} {Reformulations} for {Text} {Retrieval} in {Software} {Engineering}},
      booktitle = {Proceedings of the 35th {International} {Conference} on {Software} {Engineering} ({ICSE}'13)},
      year      = {2013},
      month     = may,
      address   = {San Francisco, CA, USA},
      pages     = {842--851},
      abstract  = {There are more than twenty distinct software engineering tasks addressed with text retrieval (TR) techniques, such as, traceability link recovery, feature location, refactoring, reuse, etc. A common issue with all TR applications is that the results of the retrieval depend largely on the quality of the query. When a query performs poorly, it has to be reformulated and this is a difficult task for someone who had trouble writing a good query in the first place. We propose a recommender (called Refoqus) based on machine learning, which is trained with a sample of queries and relevant results. Then, for a given query, it automatically recommends a reformulation strategy that should improve its performance, based on the properties of the query. We evaluated Refoqus empirically against four baseline approaches that are used in natural language document retrieval. The data used for the evaluation corresponds to changes from five open source systems in Java and C++ and it is used in the context of TR-based concept location in source code. Refoqus outperformed the baselines and its recommendations lead to query performance improvement or preservation in 84\% of the cases (in average).},
      doi       = {10.1109/ICSE.2013.6606630},
      keywords  = {automatic query reformulation, C++, C++ language, Context, Engines, feature location, Frequency measurement, Java, learning (artificial intelligence), machine learning, Natural languages, query formulation, Query Reformulation, recommender system, Recommender Systems, Refoqus, Robustness, Software engineering, text retrieval, Traceability Link Recovery, Training, Training data},
      owner     = {USER},
      timestamp = {2016.08.31},
    }
  • [DOI] S. Haiduc, G. De Rosa, G. Bavota, R. Oliveto, A. De Lucia, and A. Marcus, “Query Quality Prediction and Reformulation for Source Code Search: The Refoqus Tool,” in Proceedings of the 35th International Conference on Software Engineering (ICSE’13), San Francisco, CA, USA, 2013, pp. 1307-1310.
    [Bibtex]
    @inproceedings{Haiduc2013a,
      author     = {Haiduc, S. and De Rosa, G. and Bavota, G. and Oliveto, R. and De Lucia, A. and Marcus, A.},
      title      = {{Query} {Quality} {Prediction} and {Reformulation} for {Source} {Code} {Search}: {The} {Refoqus} {Tool}},
      booktitle  = {Proceedings of the 35th {International} {Conference} on {Software} {Engineering} ({ICSE}'13)},
      year       = {2013},
      month      = may,
      address    = {San Francisco, CA, USA},
      pages      = {1307--1310},
      abstract   = {Developers search source code frequently during their daily tasks, to find pieces of code to reuse, to find where to implement changes, etc. Code search based on text retrieval (TR) techniques has been widely used in the software engineering community during the past decade. The accuracy of the TR-based search results depends largely on the quality of the query used. We introduce Refoqus, an Eclipse plugin which is able to automatically detect the quality of a text retrieval query and to propose reformulations for it, when needed, in order to improve the results of TR-based code search. A video of Refoqus is found online at http://www.youtube.com/watch?v=UQlWGiauyk4.},
      doi        = {10.1109/ICSE.2013.6606704},
      keywords   = {Context, Eclipse plugin, Feature extraction, query formulation, Query Quality, query quality prediction, Query Reformulation, Refoqus tool, Software engineering, software maintenance, Software systems, source code search, text retrieval, text retrieval query, text retrieval technique, Training, Training data, TR-based search},
      shorttitle = {Query quality prediction and reformulation for source code search},
      owner      = {USER},
      timestamp  = {2016.08.31},
    }
  • A. Marcus and S. Haiduc, “Text Retrieval Approaches for Concept Location in Source Code,” in Software Engineering, ser. Lecture Notes in Computer Science, vol. 7171, Springer, 2013, pp. 126-158.
    [Bibtex]
    @incollection{Marcus2013,
      author    = {Marcus, Andrian and Haiduc, Sonia},
      title     = {Text {Retrieval} {Approaches} for {Concept} {Location} in {Source} {Code}},
      booktitle = {Software {Engineering}},
      editor    = {De Lucia, Andrea and Ferrucci, Filomena},
      series    = {Lecture {Notes} in {Computer} {Science}},
      volume    = {7171},
      year      = {2013},
      pages     = {126--158},
      publisher = {Springer Berlin Heidelberg},
      isbn      = {978-3-642-36053-4 978-3-642-36054-1},
      copyright = {2013 Springer-Verlag Berlin Heidelberg},
      language  = {en},
      keywords  = {Concept location, Concern location, feature location, information retrieval, Information Storage and Retrieval, Information Systems Applications (incl. Internet), Management of Computing and Information Systems, Programming Languages, Compilers, Interpreters, Programming Techniques, Software engineering, software maintenance},
      owner     = {USER},
      timestamp = {2016.08.31},
      url       = {http://link.springer.com/chapter/10.1007/978-3-642-36054-1_5},
      urldate   = {2015-09-18},
    }
  • [DOI] L. Moreno, W. Bandara, S. Haiduc, and A. Marcus, “On the Relationship Between the Vocabulary of Bug Reports and Source Code,” in Proceedings of the 29th IEEE International Conference on Software Maintenance (ICSM’13), Eindhoven, The Netherlands, 2013, pp. 452-455.
    [Bibtex]
    @InProceedings{Moreno2013,
    Title = {On the {Relationship} {Between} the {Vocabulary} of {Bug} {Reports} and {Source} {Code}},
    Author = {Moreno, L. and Bandara, W. and Haiduc, S. and Marcus, A.},
    Booktitle = {Proceedings of the 29th {IEEE} {International} {Conference} on {Software} {Maintenance} ({ICSM}'13)},
    Year = {2013},
    Address = {Eindhoven, The Netherlands},
    Month = sep,
    Pages = {452--455},
    Doi = {10.1109/ICSM.2013.70},
    ISSN = {1063-6773},
    Keywords = {information retrieval;program debugging;text analysis;TR techniques;bug descriptions;bug location techniques;bug reports;source code;text retrieval techniques;Art;Computer bugs;Data collection;Large scale integration;Software systems;Vocabulary;Bug location;source code vocabulary;text retrieval},
    Owner = {USER},
    Timestamp = {2015.09.18}
    }

2012

  • [DOI] S. Haiduc, G. Bavota, R. Oliveto, A. De Lucia, and A. Marcus, “Automatic Query Performance Assessment During the Retrieval of Software Artifacts,” in Proceedings of the 27th IEEE/ACM International Conference on Automated Software Engineering (ASE’12), Essen, Germany, 2012, pp. 90-99.
    [Bibtex]
    @inproceedings{Haiduc2012,
      author    = {Haiduc, S. and Bavota, G. and Oliveto, R. and De Lucia, A. and Marcus, A.},
      title     = {{Automatic} {Query} {Performance} {Assessment} {During} the {Retrieval} of {Software} {Artifacts}},
      booktitle = {Proceedings of the 27th {IEEE}/{ACM} {International} {Conference} on {Automated} {Software} {Engineering} ({ASE}'12)},
      year      = {2012},
      month     = sep,
      address   = {Essen, Germany},
      pages     = {90--99},
      doi       = {10.1145/2351676.2351690},
      abstract  = {Text-based search and retrieval is used by developers in the context of many SE tasks, such as, concept location, traceability link retrieval, reuse, impact analysis, etc. Solutions for software text search range from regular expression matching to complex techniques using text retrieval. In all cases, the results of a search depend on the query formulated by the developer. A developer needs to run a query and look at the results before realizing that it needs reformulating. Our aim is to automatically assess the performance of a query before it is executed. We introduce an automatic query performance assessment approach for software artifact retrieval, which uses 21 measures from the field of text retrieval. We evaluate the approach in the context of concept location in source code. The evaluation shows that our approach is able to predict the performance of queries with 79\% accuracy, using very little training data.},
      keywords  = {automatic query performance assessment, Concept location, Query performance, query processing, regular expression matching, SE tasks, software artifact retrieval, Software engineering, software text search, source code, text analysis, text-based retrieval, text-based search, text retrieval},
      owner     = {USER},
      timestamp = {2016.08.31},
    }
  • [DOI] S. Haiduc, G. Bavota, R. Oliveto, A. Marcus, and A. De Lucia, “Evaluating the Specificity of Text Retrieval Queries to Support Software Engineering Tasks,” in Proceedings of the 34th International Conference on Software Engineering (ICSE’12), Zurich, Switzerland, 2012, pp. 1273-1276.
    [Bibtex]
    @inproceedings{Haiduc2012a,
      author    = {Haiduc, S. and Bavota, G. and Oliveto, R. and Marcus, A. and De Lucia, A.},
      title     = {{Evaluating} the {Specificity} of {Text} {Retrieval} {Queries} to {Support} {Software} {Engineering} {Tasks}},
      booktitle = {Proceedings of the 34th {International} {Conference} on {Software} {Engineering} ({ICSE}'12)},
      year      = {2012},
      month     = jun,
      address   = {Zurich, Switzerland},
      pages     = {1273--1276},
      doi       = {10.1109/ICSE.2012.6227101},
      abstract  = {Text retrieval approaches have been used to address many software engineering tasks. In most cases, their use involves issuing a textual query to retrieve a set of relevant software artifacts from the system. The performance of all these approaches depends on the quality of the given query (i.e., its ability to describe the information need in such a way that the relevant software artifacts are retrieved during the search). Currently, the only way to tell that a query failed to lead to the expected software artifacts is by investing time and effort in analyzing the search results. In addition, it is often very difficult to ascertain what part of the query leads to poor results. We propose a novel pre-retrieval metric, which reflects the quality of a query by measuring the specificity of its terms. We exemplify the use of the new specificity metric on the task of concept location in source code. A preliminary empirical study shows that our metric is a good effort predictor for text retrieval-based concept location, outperforming existing techniques from the field of natural language document retrieval.},
      keywords  = {Concept location, Context, Correlation, Entropy, information retrieval, Measurement, Natural languages, natural language text, preretrieval metric, query processing, Query Quality, Query specificity, Software, software artifacts, software engineering tasks, software metrics, source code, specificity evaluation, specificity metric, text analysis, text retrieval, text retrieval-based concept location, text retrieval queries},
      owner     = {USER},
      timestamp = {2016.08.31},
    }

2011

  • [DOI] S. L. Abebe, S. Haiduc, P. Tonella, and A. Marcus, “The Effect of Lexicon Bad Smells on Concept Location in Source Code,” in Proceedings of the 11th IEEE International Working Conference on Source Code Analysis and Manipulation (SCAM’11), Williamsburg, VA, USA, 2011, pp. 125-134.
    [Bibtex]
    @inproceedings{Abebe2011,
      author    = {Abebe, S.L. and Haiduc, S. and Tonella, P. and Marcus, A.},
      title     = {The {Effect} of {Lexicon} {Bad} {Smells} on {Concept} {Location} in {Source} {Code}},
      booktitle = {Proceedings of the 11th {IEEE} {International} {Working} {Conference} on {Source} {Code} {Analysis} and {Manipulation} ({SCAM}'11)},
      year      = {2011},
      month     = sep,
      address   = {Williamsburg, VA, USA},
      pages     = {125--134},
      doi       = {10.1109/SCAM.2011.18},
      abstract  = {Experienced programmers choose identifier names carefully, in the attempt to convey information about the role and behavior of the labeled code entity in a concise and expressive way. In fact, during program understanding the names given to code entities represent one of the major sources of information used by developers. We conjecture that lexicon bad smells, such as, extreme contractions, inconsistent term use, odd grammatical structure, etc., can hinder the execution of maintenance tasks which rely on program understanding. We propose an approach to determine the extent of this impact and instantiate it on the task of concept location. In particular, we conducted a study on two open source software systems where we investigated how lexicon bad smells affect Information Retrieval-based concept location. In this study, the classes changed in response to past modification requests are located before and after lexicon bad smells are identified and removed from the source code. The results indicate that lexicon bad smells impact concept location when using IR-based techniques.},
      keywords  = {code smells, Computer bugs, Concept location, Containers, extreme contraction, Filtering, grammars, identifier name, inconsistent term use, information retrieval, labeled code entity, lexicon bad smell, lexicon bad smells, Maintenance engineering, maintenance task execution, modification request, object-oriented programming, object-oriented software system, odd grammatical structure, open source software system, program comprehension, program understanding, public domain software, reverse engineering, software lexicon, software maintenance, Software systems, source code concept location, Terminology, text retrieval},
      owner     = {USER},
      timestamp = {2016.08.31},
    }
  • [DOI] S. Haiduc and A. Marcus, “On the Effect of the Query in IR-based Concept Location,” in Proceedings of the 19th IEEE International Conference on Program Comprehension (ICPC’11), Kingston, Canada, 2011, pp. 234-237.
    [Bibtex]
    @InProceedings{Haiduc2011,
    Title = {On the {Effect} of the {Query} in {IR}-based {Concept} {Location}},
    Author = {Haiduc, S. and Marcus, A.},
    Booktitle = {Proceedings of the 19th {IEEE} {International} {Conference} on {Program} {Comprehension} ({ICPC}'11)},
    Year = {2011},
    Address = {Kingston, Canada},
    Month = jun,
    Pages = {234--237},
    Abstract = {Concept location is an essential task during software maintenance and in particular program comprehension activities. One of the approaches to this task is based on leveraging the lexical information found in the source code by means of Information Retrieval techniques. All IR-based approaches to concept location are highly dependent on the queries written by the users. An IR approach, even though good on average, might fail when the input query is poor. Currently there is no way to tell when a query leads to poor results for IR-based concept location, unless a considerable effort is put into analyzing the results after the fact. We propose an approach based on recent advances in the field of IR research, which aims at automatically determining the difficulty a query poses to an IR-based concept location technique. We plan to evaluate several models and relate them to IR performance metrics.},
    Doi = {10.1109/ICPC.2011.48},
    Keywords = {Concept location, Conferences, Correlation, Estimation, information retrieval, information retrieval techniques, IR-based concept location, IR performance metrics, lexical information, Measurement, Prediction algorithms, program comprehension, program comprehension activity, query, query processing, search, search engines, software maintenance, software metrics, software performance evaluation, source code},
    Owner = {USER},
    Timestamp = {2016.08.31}
    }
  • [DOI] S. Haiduc, “Automatically Detecting the Quality of the Query and its Implications in IR-Based Concept Location,” in Proceedings of the 26th IEEE/ACM International Conference on Automated Software Engineering (ASE’11), Lawrence, KS, USA, 2011, pp. 637-640.
    [Bibtex]
    @inproceedings{Haiduc2011a,
      author    = {Haiduc, S.},
      title     = {{Automatically} {Detecting} the {Quality} of the {Query} and its {Implications} in {IR}-{Based} {Concept} {Location}},
      booktitle = {Proceedings of the 26th {IEEE}/{ACM} {International} {Conference} on {Automated} {Software} {Engineering} ({ASE}'11)},
      year      = {2011},
      month     = nov,
      address   = {Lawrence, KS, USA},
      pages     = {637--640},
      doi       = {10.1109/ASE.2011.6100144},
      abstract  = {Concept location is an essential task during software maintenance and in particular program comprehension activities. One of the approaches to this task is based on leveraging the lexical information found in the source code by means of Information Retrieval techniques. All IR-based approaches to concept location are highly dependent on the queries written by the users. An IR approach, even though good on average, might fail when the input query is poor. Currently there is no way to tell when a query leads to poor results for IR-based concept location, unless a considerable effort is put into analyzing the results after the fact. We propose an approach based on recent advances in the field of IR research, which aims at automatically determining the difficulty a query poses to an IR-based concept location technique. We plan to evaluate several models and relate them to IR performance metrics.},
      keywords  = {Concept location, Conferences, Correlation, Estimation, information retrieval, information retrieval-based concept location, lexical information, Measurement, Prediction algorithms, program comprehension, program comprehension activities, query, query processing, Query Quality, search, search engines, software maintenance, source code},
      owner     = {USER},
      timestamp = {2016.08.31},
    }

2010

  • [DOI] S. Haiduc, J. Aponte, and A. Marcus, “Supporting Program Comprehension with Source Code Summarization,” in Proceedings of the 32nd ACM/IEEE International Conference on Software Engineering (ICSE’10), Cape Town, South Africa, 2010, pp. 223-226.
    [Bibtex]
    @inproceedings{Haiduc2010,
      author    = {Haiduc, S. and Aponte, J. and Marcus, A.},
      title     = {{Supporting} {Program} {Comprehension} with {Source} {Code} {Summarization}},
      booktitle = {Proceedings of the 32nd {ACM}/{IEEE} {International} {Conference} on {Software} {Engineering} ({ICSE}'10)},
      year      = {2010},
      month     = may,
      address   = {Cape Town, South Africa},
      volume    = {2},
      pages     = {223--226},
      doi       = {10.1145/1810295.1810335},
      abstract  = {One of the main challenges faced by today's developers is keeping up with the staggering amount of source code that needs to be read and understood. In order to help developers with this problem and reduce the costs associated with it, one solution is to use simple textual descriptions of source code entities that developers can grasp easily, while capturing the code semantics precisely. We propose an approach to automatically determine such descriptions, based on automated text summarization technology.},
      keywords  = {automated text summarization, code semantics, cost reduction, Large scale integration, Natural languages, program comprehension, reverse engineering, Semantics, software cost estimation, Software engineering, software maintenance, Software systems, Source code summarization, summary, Tagging, text summarization, textual description},
      owner     = {USER},
      timestamp = {2016.08.31},
    }
  • [DOI] S. Haiduc, J. Aponte, L. Moreno, and A. Marcus, “On the Use of Automated Text Summarization Techniques for Summarizing Source Code,” in Proceedings of the 17th Working Conference on Reverse Engineering (WCRE’10), Beverly, MA, USA, 2010, pp. 35-44.
    [Bibtex]
    @inproceedings{Haiduc2010a,
      author    = {Haiduc, S. and Aponte, J. and Moreno, L. and Marcus, A.},
      title     = {On the {Use} of {Automated} {Text} {Summarization} {Techniques} for {Summarizing} {Source} {Code}},
      booktitle = {Proceedings of the 17th {Working} {Conference} on {Reverse} {Engineering} ({WCRE}'10)},
      year      = {2010},
      month     = oct,
      address   = {Beverly, MA, USA},
      pages     = {35--44},
      doi       = {10.1109/WCRE.2010.13},
      abstract  = {During maintenance developers cannot read the entire code of large systems. They need a way to get a quick understanding of source code entities (such as, classes, methods, packages, etc.), so they can efficiently identify and then focus on the ones related to their task at hand. Sometimes reading just a method header or a class name does not tell enough about its purpose and meaning, while reading the entire implementation takes too long. We study a solution which mitigates the two approaches, i.e., short and accurate textual descriptions that illustrate the software entities without having to read the details of the implementation. We create such descriptions using techniques from automatic text summarization. The paper presents a study that investigates the suitability of various such techniques for generating source code summaries. The results indicate that a combination of text summarization techniques is most appropriate for source code summarization and that developers generally agree with the summaries produced.},
      keywords  = {automated text summarization technique, Computer science, Correlation, Large scale integration, Lead, program comprehension, Semantics, software entity, software maintenance, Software systems, Source code summarization, text summarization, textual description},
      owner     = {USER},
      timestamp = {2016.08.31},
    }

2009

  • [DOI] S. L. Abebe, S. Haiduc, P. Tonella, and A. Marcus, “Lexicon Bad Smells in Software,” in Proceedings of the 16th Working Conference on Reverse Engineering (WCRE ’09), Lille, France, 2009, pp. 95-99.
    [Bibtex]
    @InProceedings{Abebe2009,
    Title = {Lexicon {Bad} {Smells} in {Software}},
    Author = {Abebe, S.L. and Haiduc, S. and Tonella, P. and Marcus, A.},
    Booktitle = {Proceedings of the 16th {Working} {Conference} on {Reverse} {Engineering} ({WCRE} '09)},
    Year = {2009},
    Address = {Lille, France},
    Month = oct,
    Pages = {95--99},
    Abstract = {We introduce the notion of "lexicon bad smell", which parallels that of "code smell" and indicates some potential lexicon construction problems that can be addressed through refactoring (e.g., renaming). We created a catalog of lexicon bad smells and we developed a publicly available suite of detectors to locate them. The paper presents a case study in which we used the detectors on two open-source systems. The study revealed the main challenges faced in detecting the lexicon bad smells.},
    Doi = {10.1109/WCRE.2009.26},
    Keywords = {Computer science, Detectors, Documentation, Face detection, lexicon construction problem, Open source software, open-source system, programming environments, Programming profession, public domain software, reverse engineering, software lexicon bad smell catalog, software maintenance, software refactoring, source code smell, Speech, Terminology},
    Owner = {USER},
    Timestamp = {2016.08.31}
    }
  • [DOI] S. L. Abebe, S. Haiduc, A. Marcus, P. Tonella, and G. Antoniol, “Analyzing the Evolution of the Source Code Vocabulary,” in Proceedings of the 13th European Conference on Software Maintenance and Reengineering (CSMR ’09), Kaiserslautern, Germany, 2009, pp. 189-198.
    [Bibtex]
    @InProceedings{Abebe2009a,
    Title = {Analyzing the {Evolution} of the {Source} {Code} {Vocabulary}},
    Author = {Abebe, S.L. and Haiduc, S. and Marcus, A. and Tonella, P. and Antoniol, G.},
    Booktitle = {Proceedings of the 13th {European} {Conference} on {Software} {Maintenance} and {Reengineering} ({CSMR} '09)},
    Year = {2009},
    Address = {Kaiserslautern, Germany},
    Month = mar,
    Pages = {189--198},
    Doi = {10.1109/CSMR.2009.61},
    ISSN = {1534-5351},
    Keywords = {software maintenance;programming language grammar;software artifact;software systems;source code;source code vocabulary;Computer languages;Computer science;Guidelines;Information analysis;Knowledge management;Software engineering;Software maintenance;Software systems;Vocabulary;Writing;Lexicon evolution;Software vocabulary;Text mining},
    Owner = {USER},
    Timestamp = {2015.09.18}
    }
  • [DOI] G. Gay, S. Haiduc, A. Marcus, and T. Menzies, “On the Use of Relevance Feedback in IR-Based Concept Location,” in Proceedings of the 25th IEEE International Conference on Software Maintenance (ICSM’09), Edmonton, Canada, 2009, pp. 351-360.
    [Bibtex]
    @InProceedings{Gay2009,
    Title = {On the {Use} of {Relevance} {Feedback} in {IR}-{Based} {Concept} {Location}},
    Author = {Gay, G. and Haiduc, S. and Marcus, A. and Menzies, T.},
    Booktitle = {Proceedings of the 25th {IEEE} {International} {Conference} on {Software} {Maintenance} ({ICSM}'09)},
    Year = {2009},
    Address = {Edmonton, Canada},
    Month = sep,
    Pages = {351--360},
    Abstract = {Concept location is a critical activity during software evolution as it produces the location where a change is to start in response to a modification request, such as, a bug report or a new feature request. Lexical-based concept location techniques rely on matching the text embedded in the source code to queries formulated by the developers. The efficiency of such techniques is strongly dependent on the ability of the developer to write good queries. We propose an approach to augment information retrieval (IR) based concept location via an explicit relevance feedback (RF) mechanism. RF is a two-part process in which the developer judges existing results returned by a search and the IR system uses this information to perform a new search, returning more relevant information to the user. A set of case studies performed on open source software systems reveals the impact of RF on IR based concept location.},
    Doi = {10.1109/ICSM.2009.5306315},
    Keywords = {Computer science, Humans, information retrieval, Internet, lexical-based concept location, Open source software, query processing, query writing, Radio frequency, relevance feedback, search engines, Software engineering, software evolution, software maintenance, Software systems, State feedback},
    Owner = {USER},
    Timestamp = {2016.08.31}
    }

2008

  • [DOI] S. Haiduc and A. Marcus, “On the Use of Domain Terms in Source Code,” in Proceedings of the 16th IEEE International Conference on Program Comprehension (ICPC’08), Amsterdam, The Netherlands, 2008, pp. 113-122.
    [Bibtex]
    @InProceedings{Haiduc2008,
    Title = {On the {Use} of {Domain} {Terms} in {Source} {Code}},
    Author = {Haiduc, S. and Marcus, A.},
    Booktitle = {Proceedings of the 16th {IEEE} {International} {Conference} on {Program} {Comprehension} ({ICPC}'08)},
    Year = {2008},
    Address = {Amsterdam, The Netherlands},
    Month = jun,
    Pages = {113--122},
    Abstract = {Information about the problem domain of the software and the solution it implements is often embedded by developers in comments and identifiers. When using software developed by others or when are new to a project, programmers know little about how domain information is reflected in the source code. Programmers often learn about the domain from external sources such as books, articles, etc. Hence, it is important to use in comments and identifiers terms that are commonly known in the domain literature, as it is likely that programmers will use such terms when searching the source code. The paper presents a case study that investigated how domain terms are used in comments and identifiers. The study focused on three research questions: (1) to what degree are domain terms found in the source code of software from a particular problem domain?; (2) which is the preponderant source of domain terms: identifiers or comments?; and (3) to what degree are domain terms shared between several systems from the same problem domain? Within the studied software, we found that in average: 42\% of the domain terms were used in the source code; 23\% of the domain terms used in the source code are present in comments only, whereas only 11\% in the identifiers alone, and there is a 63\% agreement in the use of domain terms between any two software systems.},
    Doi = {10.1109/ICPC.2008.29},
    Keywords = {Books, comments, Computer science, domain terms, Embedded software, Graphics, Graph theory, identifiers, Open source software, Programming profession, software development, Software engineering, Software libraries, Software systems, source code, Vocabulary},
    Owner = {USER},
    Timestamp = {2016.08.31}
    }